diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 41b2ef7a528..b2e1c37bc44 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -97,6 +97,20 @@ See the [style doc](https://github.com/rust-lang/fmt-rfcs/blob/master/guide/guid Please follow this style to make TiKV easy to review, maintain, and develop. +### Run test in docker + +Alternatively, you can run test in a docker environment. Simply running the following command, it will build the pingcap/tikv_dev image and run the tikv unittests. And you may re-use the pingcap/tikv_dev image directly for ad-hoc test. + +```bash +make docker_test +``` + +Note that you may find many messages below, which in fact are not errors. They're emitted by rustc or cargo. + +```bash +: Invalid conf pair: prof:true +``` + ### Build issues To reduce compilation time and disk usage, TiKV builds do not include full debugging information by default — only tests package will have line debug info enabled. To change debuginfo, just precede build commands with `RUSTFLAGS=-Cdebuginfo=1` (for line numbers), or `RUSTFLAGS=-Cdebuginfo=2` (for full debuginfo). 
For example, diff --git a/Cargo.lock b/Cargo.lock index 7b7d6af725c..a0934490dce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2885,7 +2885,7 @@ dependencies = [ [[package]] name = "kvproto" version = "0.0.2" -source = "git+https://github.com/pingcap/kvproto.git#10e7620a630db63d769503ba99c7389f19fb6516" +source = "git+https://github.com/pingcap/kvproto.git?branch=release-7.1#0af0becd87ae89dfd48ef313a4fca7e04688b2b2" dependencies = [ "futures 0.3.15", "grpcio", @@ -5940,6 +5940,7 @@ dependencies = [ "dashmap", "encryption", "engine_rocks", + "engine_test", "engine_traits", "error_code", "external_storage_export", @@ -5954,6 +5955,7 @@ dependencies = [ "online_config", "openssl", "prometheus", + "protobuf", "rand 0.8.5", "serde", "serde_derive", diff --git a/Cargo.toml b/Cargo.toml index 113518fc84c..42725dae955 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -388,7 +388,7 @@ txn_types = { path = "components/txn_types" } grpcio = { version = "0.10.4", default-features = false, features = ["openssl-vendored", "protobuf-codec", "nightly"] } grpcio-health = { version = "0.10.4", default-features = false, features = ["protobuf-codec"] } tipb = { git = "https://github.com/pingcap/tipb.git" } -kvproto = { git = "https://github.com/pingcap/kvproto.git" } +kvproto = { git = "https://github.com/pingcap/kvproto.git", branch = "release-7.1" } yatp = { git = "https://github.com/tikv/yatp.git", branch = "master" } tokio-timer = { git = "https://github.com/tikv/tokio", branch = "tokio-timer-hotfix" } slog = { version = "2.3", features = ["max_level_trace", "release_max_level_debug"] } diff --git a/Dockerfile.test b/Dockerfile.test new file mode 100644 index 00000000000..da23a7a30b6 --- /dev/null +++ b/Dockerfile.test @@ -0,0 +1,57 @@ +# This Docker image contains a minimal build environment for TiKV +# +# It contains all the tools necessary to reproduce official production builds of TiKV + +# We need to use CentOS 7 because many of our users choose this as their deploy machine. 
+# Since the glibc it uses (2.17) is from 2012 (https://sourceware.org/glibc/wiki/Glibc%20Timeline) +# it is our lowest common denominator in terms of distro support. + +# Some commands in this script are structured in order to reduce the number of layers Docker +# generates. Unfortunately Docker is limited to only 125 layers: +# https://github.com/moby/moby/blob/a9507c6f76627fdc092edc542d5a7ef4a6df5eec/layer/layer.go#L50-L53 + +# We require epel packages, so enable the fedora EPEL repo then install dependencies. +# Install the system dependencies +# Attempt to clean and rebuild the cache to avoid 404s + +# To avoid rebuilds we first install all Cargo dependencies + + +# The prepare image avoid ruining the cache of the builder +FROM centos:7.6.1810 as builder + +RUN yum install -y epel-release && \ + yum clean all && \ + yum makecache + +RUN yum install -y centos-release-scl && \ + yum install -y \ + devtoolset-8 \ + perl cmake3 && \ + yum clean all + +# CentOS gives cmake 3 a weird binary name, so we link it to something more normal +# This is required by many build scripts, including ours. 
+RUN ln -s /usr/bin/cmake3 /usr/bin/cmake +ENV LIBRARY_PATH /usr/local/lib:$LIBRARY_PATH +ENV LD_LIBRARY_PATH /usr/local/lib:$LD_LIBRARY_PATH + +# Install protoc +RUN curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-x86_64.zip" +RUN unzip protoc-3.15.8-linux-x86_64.zip -d /usr/local/ +ENV PATH /usr/local/bin/:$PATH + +# Install Rustup +RUN curl https://sh.rustup.rs -sSf | sh -s -- --no-modify-path --default-toolchain none -y +ENV PATH /root/.cargo/bin/:$PATH + +# Install the Rust toolchain +WORKDIR /tikv +COPY rust-toolchain ./ +RUN rustup self update \ + && rustup set profile minimal \ + && rustup default $(cat "rust-toolchain") + +RUN cargo install cargo-nextest --locked + +ENTRYPOINT ["sh", "-c", "source /opt/rh/devtoolset-8/enable && \"$@\"", "-s"] diff --git a/Makefile b/Makefile index 8c595643828..566108057ab 100644 --- a/Makefile +++ b/Makefile @@ -137,6 +137,7 @@ export PROXY_BUILD_GIT_BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD 2> /dev export DOCKER_IMAGE_NAME ?= "pingcap/tikv" export DOCKER_IMAGE_TAG ?= "latest" +export DEV_DOCKER_IMAGE_NAME ?= "pingcap/tikv_dev" # Turn on cargo pipelining to add more build parallelism. This has shown decent # speedups in TiKV. diff --git a/cmd/tikv-ctl/src/executor.rs b/cmd/tikv-ctl/src/executor.rs index df095e44425..a0d4a039d2c 100644 --- a/cmd/tikv-ctl/src/executor.rs +++ b/cmd/tikv-ctl/src/executor.rs @@ -1,13 +1,15 @@ // Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. 
use std::{ - borrow::ToOwned, cmp::Ordering, pin::Pin, str, string::ToString, sync::Arc, time::Duration, u64, + borrow::ToOwned, cmp::Ordering, path::Path, pin::Pin, str, string::ToString, sync::Arc, + time::Duration, u64, }; use encryption_export::data_key_manager_from_config; use engine_rocks::util::{db_exist, new_engine_opt}; use engine_traits::{ - Engines, Error as EngineError, RaftEngine, ALL_CFS, CF_DEFAULT, CF_LOCK, CF_WRITE, DATA_CFS, + Engines, Error as EngineError, RaftEngine, TabletRegistry, ALL_CFS, CF_DEFAULT, CF_LOCK, + CF_WRITE, DATA_CFS, }; use futures::{executor::block_on, future, stream, Stream, StreamExt, TryStreamExt}; use grpcio::{ChannelBuilder, Environment}; @@ -25,12 +27,16 @@ use raft_log_engine::RaftLogEngine; use raftstore::store::{util::build_key_range, INIT_EPOCH_CONF_VER}; use security::SecurityManager; use serde_json::json; +use server::fatal; +use slog_global::crit; use tikv::{ config::{ConfigController, TikvConfig}, server::{ - debug::{BottommostLevelCompaction, Debugger, RegionInfo}, + debug::{BottommostLevelCompaction, Debugger, DebuggerImpl, RegionInfo}, + debug2::DebuggerImplV2, KvEngineFactoryBuilder, }, + storage::config::EngineType, }; use tikv_util::escape; @@ -72,13 +78,10 @@ pub fn new_debug_executor( let factory = KvEngineFactoryBuilder::new(env.clone(), cfg, cache) .lite(true) .build(); - let kv_db = match factory.create_shared_db(data_dir) { - Ok(db) => db, - Err(e) => handle_engine_error(e), - }; let cfg_controller = ConfigController::default(); if !cfg.raft_engine.enable { + assert_eq!(EngineType::RaftKv, cfg.storage.engine); let raft_db_opts = cfg.raftdb.build_opt(env, None); let raft_db_cf_opts = cfg.raftdb.build_cf_opts(factory.block_cache()); let raft_path = cfg.infer_raft_db_path(Some(data_dir)).unwrap(); @@ -90,7 +93,13 @@ pub fn new_debug_executor( Ok(db) => db, Err(e) => handle_engine_error(e), }; - let debugger = Debugger::new(Engines::new(kv_db, raft_db), cfg_controller); + + let kv_db = match 
factory.create_shared_db(data_dir) { + Ok(db) => db, + Err(e) => handle_engine_error(e), + }; + + let debugger = DebuggerImpl::new(Engines::new(kv_db, raft_db), cfg_controller); Box::new(debugger) as Box } else { let mut config = cfg.raft_engine.config(); @@ -100,8 +109,24 @@ pub fn new_debug_executor( tikv_util::logger::exit_process_gracefully(-1); } let raft_db = RaftLogEngine::new(config, key_manager, None /* io_rate_limiter */).unwrap(); - let debugger = Debugger::new(Engines::new(kv_db, raft_db), cfg_controller); - Box::new(debugger) as Box + match cfg.storage.engine { + EngineType::RaftKv => { + let kv_db = match factory.create_shared_db(data_dir) { + Ok(db) => db, + Err(e) => handle_engine_error(e), + }; + + let debugger = DebuggerImpl::new(Engines::new(kv_db, raft_db), cfg_controller); + Box::new(debugger) as Box + } + EngineType::RaftKv2 => { + let registry = + TabletRegistry::new(Box::new(factory), Path::new(data_dir).join("tablets")) + .unwrap_or_else(|e| fatal!("failed to create tablet registry {:?}", e)); + let debugger = DebuggerImplV2::new(registry, raft_db, cfg_controller); + Box::new(debugger) as Box + } + } } } @@ -869,11 +894,11 @@ impl DebugExecutor for DebugClient { } } -impl DebugExecutor for Debugger { +impl DebugExecutor for DebuggerImpl { fn check_local_mode(&self) {} fn get_all_regions_in_store(&self) -> Vec { - self.get_all_regions_in_store() + Debugger::get_all_regions_in_store(self) .unwrap_or_else(|e| perror_and_exit("Debugger::get_all_regions_in_store", e)) } @@ -929,7 +954,7 @@ impl DebugExecutor for Debugger { threads: u32, bottommost: BottommostLevelCompaction, ) { - self.compact(db, cf, from, to, threads, bottommost) + Debugger::compact(self, db, cf, from, to, threads, bottommost) .unwrap_or_else(|e| perror_and_exit("Debugger::compact", e)); } @@ -973,7 +998,7 @@ impl DebugExecutor for Debugger { } fn recover_all(&self, threads: usize, read_only: bool) { - Debugger::recover_all(self, threads, read_only) + 
DebuggerImpl::recover_all(self, threads, read_only) .unwrap_or_else(|e| perror_and_exit("Debugger::recover all", e)); } @@ -1117,3 +1142,129 @@ fn handle_engine_error(err: EngineError) -> ! { tikv_util::logger::exit_process_gracefully(-1); } + +impl DebugExecutor for DebuggerImplV2 { + fn check_local_mode(&self) {} + + fn get_all_regions_in_store(&self) -> Vec { + Debugger::get_all_regions_in_store(self) + .unwrap_or_else(|e| perror_and_exit("Debugger::get_all_regions_in_store", e)) + } + + fn get_value_by_key(&self, cf: &str, key: Vec) -> Vec { + self.get(DbType::Kv, cf, &key) + .unwrap_or_else(|e| perror_and_exit("Debugger::get", e)) + } + + fn get_region_size(&self, region: u64, cfs: Vec<&str>) -> Vec<(String, usize)> { + self.region_size(region, cfs) + .unwrap_or_else(|e| perror_and_exit("Debugger::region_size", e)) + .into_iter() + .map(|(cf, size)| (cf.to_owned(), size)) + .collect() + } + + fn get_region_info(&self, region: u64) -> RegionInfo { + self.region_info(region) + .unwrap_or_else(|e| perror_and_exit("Debugger::region_info", e)) + } + + fn get_raft_log(&self, region: u64, index: u64) -> Entry { + self.raft_log(region, index) + .unwrap_or_else(|e| perror_and_exit("Debugger::raft_log", e)) + } + + fn get_mvcc_infos(&self, from: Vec, to: Vec, limit: u64) -> MvccInfoStream { + let iter = self + .scan_mvcc(&from, &to, limit) + .unwrap_or_else(|e| perror_and_exit("Debugger::scan_mvcc", e)); + let stream = stream::iter(iter).map_err(|e| e.to_string()); + Box::pin(stream) + } + + fn raw_scan_impl(&self, _from_key: &[u8], _end_key: &[u8], _limit: usize, _cf: &str) { + unimplemented!() + } + + fn do_compaction( + &self, + db: DbType, + cf: &str, + from: &[u8], + to: &[u8], + threads: u32, + bottommost: BottommostLevelCompaction, + ) { + Debugger::compact(self, db, cf, from, to, threads, bottommost) + .unwrap_or_else(|e| perror_and_exit("Debugger::compact", e)); + } + + fn set_region_tombstone(&self, _regions: Vec) { + unimplemented!() + } + + fn 
set_region_tombstone_by_id(&self, _regions: Vec) { + unimplemented!() + } + + fn recover_regions(&self, _regions: Vec, _read_only: bool) { + unimplemented!() + } + + fn recover_all(&self, _threads: usize, _read_only: bool) { + unimplemented!() + } + + fn print_bad_regions(&self) { + unimplemented!() + } + + fn remove_fail_stores( + &self, + _store_ids: Vec, + _region_ids: Option>, + _promote_learner: bool, + ) { + unimplemented!() + } + + fn drop_unapplied_raftlog(&self, _region_ids: Option>) { + unimplemented!() + } + + fn recreate_region(&self, _sec_mgr: Arc, _pd_cfg: &PdConfig, _region_id: u64) { + unimplemented!() + } + + fn dump_metrics(&self, _tags: Vec<&str>) { + unimplemented!() + } + + fn check_region_consistency(&self, _: u64) { + unimplemented!() + } + + fn modify_tikv_config(&self, _config_name: &str, _config_value: &str) { + unimplemented!() + } + + fn dump_region_properties(&self, _region_id: u64) { + unimplemented!() + } + + fn dump_range_properties(&self, _start: Vec, _end: Vec) { + unimplemented!() + } + + fn dump_store_info(&self) { + unimplemented!() + } + + fn dump_cluster_info(&self) { + unimplemented!() + } + + fn reset_to_version(&self, _version: u64) { + unimplemented!() + } +} diff --git a/components/cdc/src/delegate.rs b/components/cdc/src/delegate.rs index c4212c426be..adca54dace0 100644 --- a/components/cdc/src/delegate.rs +++ b/components/cdc/src/delegate.rs @@ -39,6 +39,7 @@ use crate::{ metrics::*, old_value::{OldValueCache, OldValueCallback}, service::ConnId, + txn_source::TxnSource, Error, Result, }; @@ -550,8 +551,10 @@ impl Delegate { row_size = 0; } } - // if the `txn_source` is not 0 and we should filter it out, skip this event. 
- if row.txn_source != 0 && filter_loop { + let lossy_ddl_filter = TxnSource::is_lossy_ddl_reorg_source_set(row.txn_source); + let cdc_write_filter = + TxnSource::is_cdc_write_source_set(row.txn_source) && filter_loop; + if lossy_ddl_filter || cdc_write_filter { continue; } if current_rows_size + row_size >= CDC_EVENT_MAX_BYTES { @@ -648,6 +651,14 @@ impl Delegate { return Ok(()); } + // Filter the entries which are lossy DDL events. + // We don't need to send them to downstream. + let entries = entries + .iter() + .filter(|x| !TxnSource::is_lossy_ddl_reorg_source_set(x.txn_source)) + .cloned() + .collect::>(); + let downstreams = self.downstreams(); assert!( !downstreams.is_empty(), @@ -655,15 +666,15 @@ impl Delegate { self.region_id ); - // collect the change event cause by user write, which is `txn_source` = 0. - // for changefeed which only need the user write, send the `filtered`, or else, - // send them all. + // Collect the change event cause by user write, which cdc write source is not + // set. For changefeed which only need the user write, + // send the `filtered_entries`, or else, send them all. let mut filtered_entries = None; for downstream in downstreams { if downstream.filter_loop { let filtered = entries .iter() - .filter(|x| x.txn_source == 0) + .filter(|x| !TxnSource::is_cdc_write_source_set(x.txn_source)) .cloned() .collect::>(); if !filtered.is_empty() { @@ -692,9 +703,11 @@ impl Delegate { } else { downstream.observed_range.filter_entries(entries.clone()) }; + if entries_clone.is_empty() { return Ok(()); } + let event = Event { region_id, index, @@ -1468,6 +1481,107 @@ mod tests { assert_eq!(e.events[0].get_entries().get_entries().len(), 2, "{:?}", e); } + fn test_downstream_txn_source_filter(txn_source: TxnSource, filter_loop: bool) { + // Create a new delegate that observes [a, f). 
+ let observed_range = ObservedRange::new( + Key::from_raw(b"a").into_encoded(), + Key::from_raw(b"f").into_encoded(), + ) + .unwrap(); + let txn_extra_op = Arc::new(AtomicCell::new(TxnExtraOp::Noop)); + let mut delegate = Delegate::new(1, txn_extra_op); + assert!(delegate.handle.is_observing()); + + let mut map = HashMap::default(); + for k in b'a'..=b'e' { + let mut put = PutRequest::default(); + put.key = Key::from_raw(&[k]).into_encoded(); + put.cf = "lock".to_owned(); + let mut lock = Lock::new( + LockType::Put, + put.key.clone(), + 1.into(), + 10, + None, + TimeStamp::zero(), + 0, + TimeStamp::zero(), + ); + // Only the key `a` is a normal write. + if k != b'a' { + lock = lock.set_txn_source(txn_source.into()); + } + put.value = lock.to_bytes(); + delegate + .sink_txn_put( + put, + false, + &mut map, + |_: &mut EventRow, _: TimeStamp| Ok(()), + ) + .unwrap(); + } + assert_eq!(map.len(), 5); + + let (sink, mut drain) = channel(1, MemoryQuota::new(1024)); + let downstream = Downstream { + id: DownstreamId::new(), + req_id: 1, + conn_id: ConnId::new(), + peer: String::new(), + region_epoch: RegionEpoch::default(), + sink: Some(sink), + state: Arc::new(AtomicCell::new(DownstreamState::Normal)), + kv_api: ChangeDataRequestKvApi::TiDb, + filter_loop, + observed_range, + }; + delegate.add_downstream(downstream); + let entries = map.values().map(|(r, _)| r).cloned().collect(); + delegate + .sink_downstream(entries, 1, ChangeDataRequestKvApi::TiDb) + .unwrap(); + + let (mut tx, mut rx) = futures::channel::mpsc::unbounded(); + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.spawn(async move { + drain.forward(&mut tx).await.unwrap(); + }); + let (e, _) = recv_timeout(&mut rx, std::time::Duration::from_secs(5)) + .unwrap() + .unwrap(); + assert_eq!(e.events[0].get_entries().get_entries().len(), 1, "{:?}", e); + } + + #[test] + fn test_downstream_filter_cdc_write_entires() { + let mut txn_source = TxnSource::default(); + 
txn_source.set_cdc_write_source(1); + + test_downstream_txn_source_filter(txn_source, true); + } + + #[test] + fn test_downstream_filter_lossy_ddl_entires() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is false, we should still ignore lossy + // ddl changes. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is true, we should still ignore some + // events. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_downstream_txn_source_filter(txn_source, true); + } + #[test] fn test_decode_rawkv() { let cases = vec![ diff --git a/components/cdc/src/initializer.rs b/components/cdc/src/initializer.rs index 8f6f8ed38a7..c06b13424ba 100644 --- a/components/cdc/src/initializer.rs +++ b/components/cdc/src/initializer.rs @@ -591,6 +591,7 @@ mod tests { use tokio::runtime::{Builder, Runtime}; use super::*; + use crate::txn_source::TxnSource; struct ReceiverRunnable { tx: Sender, @@ -786,18 +787,16 @@ mod tests { worker.stop(); } - #[test] - fn test_initializer_filter_loop() { + fn test_initializer_txn_source_filter(txn_source: TxnSource, filter_loop: bool) { let mut engine = TestEngineBuilder::new().build_without_cache().unwrap(); let mut total_bytes = 0; - for i in 10..100 { let (k, v) = (&[b'k', i], &[b'v', i]); total_bytes += k.len(); total_bytes += v.len(); let ts = TimeStamp::new(i as _); - must_prewrite_put_with_txn_soucre(&mut engine, k, v, k, ts, 1); + must_prewrite_put_with_txn_soucre(&mut engine, k, v, k, ts, txn_source.into()); } let snap = engine.snapshot(Default::default()).unwrap(); @@ -808,7 +807,7 @@ mod tests { buffer, engine.kv_engine(), 
ChangeDataRequestKvApi::TiDb, - true, + filter_loop, ); let th = pool.spawn(async move { initializer @@ -833,6 +832,34 @@ mod tests { worker.stop(); } + #[test] + fn test_initializer_cdc_write_filter() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + test_initializer_txn_source_filter(txn_source, true); + } + + #[test] + fn test_initializer_lossy_ddl_filter() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is false, we should still ignore lossy + // ddl changes. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, false); + + // With cdr write source and filter loop is true, we should still ignore all + // events. + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + txn_source.set_lossy_ddl_reorg_source(1); + test_initializer_txn_source_filter(txn_source, true); + } + // Test `hint_min_ts` works fine with `ExtraOp::ReadOldValue`. // Whether `DeltaScanner` emits correct old values or not is already tested by // another case `test_old_value_with_hint_min_ts`, so here we only care about diff --git a/components/cdc/src/lib.rs b/components/cdc/src/lib.rs index 7d63bf5c115..c913cefb92e 100644 --- a/components/cdc/src/lib.rs +++ b/components/cdc/src/lib.rs @@ -13,6 +13,7 @@ pub mod metrics; mod observer; mod old_value; mod service; +mod txn_source; pub use channel::{recv_timeout, CdcEvent, MemoryQuota}; pub use config::CdcConfigManager; diff --git a/components/cdc/src/txn_source.rs b/components/cdc/src/txn_source.rs new file mode 100644 index 00000000000..81dc9f95096 --- /dev/null +++ b/components/cdc/src/txn_source.rs @@ -0,0 +1,116 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +// The bitmap: +// |RESERVED|LOSSY_DDL_REORG_SOURCE_BITS|CDC_WRITE_SOURCE_BITS| +// | 48 | 8 | 4(RESERVED) | 4 | +// +// TiCDC uses 1 - 255 to indicate the source of TiDB. +// For now, 1 - 15 are reserved for TiCDC to implement BDR synchronization. +// 16 - 255 are reserved for extendability. +const CDC_WRITE_SOURCE_BITS: u64 = 8; +const CDC_WRITE_SOURCE_MAX: u64 = (1 << CDC_WRITE_SOURCE_BITS) - 1; + +// TiCDC uses 1-255 to indicate the change from a lossy DDL reorg Backfill job. +// For now, we only use 1 for column reorg backfill job. +#[cfg(test)] +const LOSSY_DDL_REORG_SOURCE_BITS: u64 = 8; +#[cfg(test)] +const LOSSY_DDL_COLUMN_REORG_SOURCE: u64 = 1; +#[cfg(test)] +const LOSSY_DDL_REORG_SOURCE_MAX: u64 = (1 << LOSSY_DDL_REORG_SOURCE_BITS) - 1; +const LOSSY_DDL_REORG_SOURCE_SHIFT: u64 = CDC_WRITE_SOURCE_BITS; + +/// For kv.TxnSource +/// We use an uint64 to represent the source of a transaction. +/// The first 8 bits are reserved for TiCDC, and the next 8 bits are reserved +/// for Lossy DDL reorg Backfill job. The remaining 48 bits are reserved for +/// extendability. 
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +pub(crate) struct TxnSource(u64); + +impl TxnSource { + #[cfg(test)] + pub(crate) fn set_cdc_write_source(&mut self, value: u64) { + if value > CDC_WRITE_SOURCE_MAX { + unreachable!("Only use it in tests") + } + self.0 |= value; + } + + #[cfg(test)] + pub(crate) fn get_cdc_write_source(&self) -> u64 { + self.0 & CDC_WRITE_SOURCE_MAX + } + + pub(crate) fn is_cdc_write_source_set(txn_source: u64) -> bool { + (txn_source & CDC_WRITE_SOURCE_MAX) != 0 + } + + #[cfg(test)] + pub(crate) fn set_lossy_ddl_reorg_source(&mut self, value: u64) { + if value > LOSSY_DDL_REORG_SOURCE_MAX { + unreachable!("Only use it in tests") + } + self.0 |= value << LOSSY_DDL_REORG_SOURCE_SHIFT; + } + + #[cfg(test)] + pub(crate) fn get_lossy_ddl_reorg_source(&self) -> u64 { + (self.0 >> LOSSY_DDL_REORG_SOURCE_SHIFT) & LOSSY_DDL_REORG_SOURCE_MAX + } + + pub(crate) fn is_lossy_ddl_reorg_source_set(txn_source: u64) -> bool { + (txn_source >> LOSSY_DDL_REORG_SOURCE_SHIFT) != 0 + } +} + +impl From for u64 { + fn from(val: TxnSource) -> Self { + val.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_cdc_write_source() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + assert_eq!(txn_source.get_cdc_write_source(), 1); + } + + #[test] + fn test_is_cdc_write_source_set() { + let mut txn_source = TxnSource::default(); + txn_source.set_cdc_write_source(1); + assert_eq!(TxnSource::is_cdc_write_source_set(txn_source.0), true); + + let txn_source = TxnSource::default(); + assert_eq!(TxnSource::is_cdc_write_source_set(txn_source.0), false); + } + + #[test] + fn test_get_lossy_ddl_reorg_source() { + let mut txn_source = TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(LOSSY_DDL_COLUMN_REORG_SOURCE); + assert_eq!( + txn_source.get_lossy_ddl_reorg_source(), + LOSSY_DDL_COLUMN_REORG_SOURCE + ); + } + + #[test] + fn test_is_lossy_ddl_reorg_source_set() { + let mut txn_source = 
TxnSource::default(); + txn_source.set_lossy_ddl_reorg_source(LOSSY_DDL_COLUMN_REORG_SOURCE); + assert_eq!(TxnSource::is_lossy_ddl_reorg_source_set(txn_source.0), true); + + let txn_source = TxnSource::default(); + assert_eq!( + TxnSource::is_lossy_ddl_reorg_source_set(txn_source.0), + false + ); + } +} diff --git a/components/engine_panic/src/engine.rs b/components/engine_panic/src/engine.rs index d8faf8fee01..b5ce0d1516e 100644 --- a/components/engine_panic/src/engine.rs +++ b/components/engine_panic/src/engine.rs @@ -22,7 +22,7 @@ impl KvEngine for PanicEngine { fn bad_downcast(&self) -> &T { panic!() } - #[cfg(any(test, feature = "testexport"))] + #[cfg(feature = "testexport")] fn inner_refcount(&self) -> usize { panic!() } diff --git a/components/engine_panic/src/misc.rs b/components/engine_panic/src/misc.rs index 5603bf43c77..027612d588e 100644 --- a/components/engine_panic/src/misc.rs +++ b/components/engine_panic/src/misc.rs @@ -1,6 +1,6 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use engine_traits::{DeleteStrategy, MiscExt, Range, Result, StatisticsReporter}; +use engine_traits::{DeleteStrategy, MiscExt, Range, RangeStats, Result, StatisticsReporter}; use crate::engine::PanicEngine; @@ -100,12 +100,7 @@ impl MiscExt for PanicEngine { panic!() } - fn get_range_entries_and_versions( - &self, - cf: &str, - start: &[u8], - end: &[u8], - ) -> Result> { + fn get_range_stats(&self, cf: &str, start: &[u8], end: &[u8]) -> Result> { panic!() } diff --git a/components/engine_panic/src/sst.rs b/components/engine_panic/src/sst.rs index a0f1479604c..119cd5884a3 100644 --- a/components/engine_panic/src/sst.rs +++ b/components/engine_panic/src/sst.rs @@ -1,6 +1,6 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::{marker::PhantomData, path::PathBuf}; +use std::{marker::PhantomData, path::PathBuf, sync::Arc}; use engine_traits::{ CfName, ExternalSstFileInfo, IterOptions, Iterable, Iterator, RefIterable, Result, @@ -21,6 +21,12 @@ impl SstReader for PanicSstReader { fn open(path: &str) -> Result { panic!() } + fn open_encrypted( + path: &str, + mgr: Arc, + ) -> Result { + panic!() + } fn verify_checksum(&self) -> Result<()> { panic!() } diff --git a/components/engine_rocks/src/encryption.rs b/components/engine_rocks/src/encryption.rs index 3caf07a0276..99d492c4792 100644 --- a/components/engine_rocks/src/encryption.rs +++ b/components/engine_rocks/src/encryption.rs @@ -31,6 +31,12 @@ pub struct WrappedEncryptionKeyManager { manager: Arc, } +impl WrappedEncryptionKeyManager { + pub fn new(manager: Arc) -> Self { + Self { manager } + } +} + impl DBEncryptionKeyManager for WrappedEncryptionKeyManager { fn get_file(&self, fname: &str) -> Result { self.manager diff --git a/components/engine_rocks/src/engine.rs b/components/engine_rocks/src/engine.rs index 6c6231ca42f..293b74e3bca 100644 --- a/components/engine_rocks/src/engine.rs +++ b/components/engine_rocks/src/engine.rs @@ -195,7 +195,7 @@ impl KvEngine for RocksEngine { e.downcast_ref().expect("bad engine downcast") } - #[cfg(any(test, feature = "testexport"))] + #[cfg(feature = "testexport")] fn inner_refcount(&self) -> usize { Arc::strong_count(&self.db) } diff --git a/components/engine_rocks/src/misc.rs b/components/engine_rocks/src/misc.rs index 8d5bb3d43ef..d4ffa564861 100644 --- a/components/engine_rocks/src/misc.rs +++ b/components/engine_rocks/src/misc.rs @@ -2,7 +2,7 @@ use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, - Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, + Range, RangeStats, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, }; use rocksdb::Range as RocksRange; use tikv_util::{box_try, 
keybuilder::KeyBuilder}; @@ -353,15 +353,8 @@ impl MiscExt for RocksEngine { Ok(total) } - fn get_range_entries_and_versions( - &self, - cf: &str, - start: &[u8], - end: &[u8], - ) -> Result> { - Ok(crate::properties::get_range_entries_and_versions( - self, cf, start, end, - )) + fn get_range_stats(&self, cf: &str, start: &[u8], end: &[u8]) -> Result> { + Ok(crate::properties::get_range_stats(self, cf, start, end)) } fn is_stalled_or_stopped(&self) -> bool { diff --git a/components/engine_rocks/src/properties.rs b/components/engine_rocks/src/properties.rs index a95a9aecf7b..d1158ac9c4e 100644 --- a/components/engine_rocks/src/properties.rs +++ b/components/engine_rocks/src/properties.rs @@ -9,7 +9,7 @@ use std::{ }; use api_version::{ApiV2, KeyMode, KvFormat}; -use engine_traits::{raw_ttl::ttl_current_ts, MvccProperties, Range}; +use engine_traits::{raw_ttl::ttl_current_ts, MvccProperties, Range, RangeStats}; use rocksdb::{ DBEntryType, TablePropertiesCollector, TablePropertiesCollectorFactory, TitanBlobIndex, UserCollectedProperties, @@ -530,12 +530,12 @@ impl TablePropertiesCollectorFactory } } -pub fn get_range_entries_and_versions( +pub fn get_range_stats( engine: &crate::RocksEngine, cf: &str, start: &[u8], end: &[u8], -) -> Option<(u64, u64)> { +) -> Option { let range = Range::new(start, end); let collection = match engine.get_properties_of_tables_in_range(cf, &[range]) { Ok(v) => v, @@ -557,8 +557,11 @@ pub fn get_range_entries_and_versions( num_entries += v.num_entries(); props.add(&mvcc); } - - Some((num_entries, props.num_versions)) + Some(RangeStats { + num_entries, + num_versions: props.num_versions, + num_rows: props.num_rows, + }) } #[cfg(test)] @@ -773,10 +776,9 @@ mod tests { let start_keys = keys::data_key(&[]); let end_keys = keys::data_end_key(&[]); - let (entries, versions) = - get_range_entries_and_versions(&db, CF_WRITE, &start_keys, &end_keys).unwrap(); - assert_eq!(entries, (cases.len() * 2) as u64); - assert_eq!(versions, cases.len() as 
u64); + let range_stats = get_range_stats(&db, CF_WRITE, &start_keys, &end_keys).unwrap(); + assert_eq!(range_stats.num_entries, (cases.len() * 2) as u64); + assert_eq!(range_stats.num_versions, cases.len() as u64); } #[test] diff --git a/components/engine_rocks/src/range_properties.rs b/components/engine_rocks/src/range_properties.rs index 101a004982a..dfc41db5f6e 100644 --- a/components/engine_rocks/src/range_properties.rs +++ b/components/engine_rocks/src/range_properties.rs @@ -9,7 +9,7 @@ use tikv_util::{box_err, box_try, debug, info}; use crate::{ engine::RocksEngine, - properties::{get_range_entries_and_versions, RangeProperties}, + properties::{get_range_stats, RangeProperties}, }; impl RangePropertiesExt for RocksEngine { @@ -27,9 +27,8 @@ impl RangePropertiesExt for RocksEngine { let start = &range.start_key; let end = &range.end_key; - let (_, keys) = - get_range_entries_and_versions(self, CF_WRITE, start, end).unwrap_or_default(); - Ok(keys) + let range_stats = get_range_stats(self, CF_WRITE, start, end).unwrap_or_default(); + Ok(range_stats.num_versions) } fn get_range_approximate_keys_cf( diff --git a/components/engine_rocks/src/sst.rs b/components/engine_rocks/src/sst.rs index 85c30d74a87..145fa9a7bce 100644 --- a/components/engine_rocks/src/sst.rs +++ b/components/engine_rocks/src/sst.rs @@ -3,8 +3,8 @@ use std::{path::PathBuf, sync::Arc}; use engine_traits::{ - Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, SstCompressionType, - SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, + EncryptionKeyManager, Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, + SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, }; use fail::fail_point; use kvproto::import_sstpb::SstMeta; @@ -13,8 +13,11 @@ use rocksdb::{ EnvOptions, ExternalSstFileInfo as RawExternalSstFileInfo, SequentialFile, SstFileReader, SstFileWriter, DB, }; +use tikv_util::box_err; -use 
crate::{engine::RocksEngine, options::RocksReadOptions, r2e}; +use crate::{ + encryption::WrappedEncryptionKeyManager, engine::RocksEngine, options::RocksReadOptions, r2e, +}; impl SstExt for RocksEngine { type SstReader = RocksSstReader; @@ -63,6 +66,14 @@ impl SstReader for RocksSstReader { fn open(path: &str) -> Result { Self::open_with_env(path, None) } + fn open_encrypted(path: &str, mgr: Arc) -> Result { + let env = Env::new_key_managed_encrypted_env( + Arc::default(), + WrappedEncryptionKeyManager::new(mgr), + ) + .map_err(|err| Error::Other(box_err!("failed to open encrypted env: {}", err)))?; + Self::open_with_env(path, Some(Arc::new(env))) + } fn verify_checksum(&self) -> Result<()> { self.inner.verify_checksum().map_err(r2e)?; Ok(()) diff --git a/components/engine_traits/src/engine.rs b/components/engine_traits/src/engine.rs index aa90c23b429..b3ee1c93b05 100644 --- a/components/engine_traits/src/engine.rs +++ b/components/engine_traits/src/engine.rs @@ -69,6 +69,6 @@ pub trait KvEngine: /// A method for test to expose inner db refcount in order to make sure a /// full release of engine. - #[cfg(any(test, feature = "testexport"))] + #[cfg(feature = "testexport")] fn inner_refcount(&self) -> usize; } diff --git a/components/engine_traits/src/flush.rs b/components/engine_traits/src/flush.rs index 8b0566f2cfb..d79ee9631ca 100644 --- a/components/engine_traits/src/flush.rs +++ b/components/engine_traits/src/flush.rs @@ -13,10 +13,10 @@ //! be used as the start state. use std::{ - collections::LinkedList, + collections::{HashMap, LinkedList}, sync::{ atomic::{AtomicU64, Ordering}, - Arc, Mutex, + Arc, Mutex, RwLock, }, }; @@ -54,6 +54,47 @@ struct FlushProgress { last_flushed: [u64; DATA_CFS_LEN], } +/// A share state between raftstore and underlying engine. +/// +/// raftstore will update state changes and corresponding sst apply index, when +/// apply ingest sst request, it should ensure the sst can be deleted +/// if the flushed index greater than it . 
+#[derive(Debug, Clone)] +pub struct SstApplyState { + sst_map: Arc, u64>>>, +} + +impl Default for SstApplyState { + fn default() -> Self { + Self { + sst_map: Arc::new(RwLock::new(HashMap::new())), + } + } +} + +impl SstApplyState { + #[inline] + pub fn registe_ssts(&self, uuids: Vec>, sst_applied_index: u64) { + let mut map = self.sst_map.write().unwrap(); + for uuid in uuids { + map.insert(uuid, sst_applied_index); + } + } + + /// Query the sst applied index. + #[inline] + pub fn sst_applied_index(&self, uuid: &Vec) -> Option { + self.sst_map.read().unwrap().get(uuid).copied() + } + + pub fn delete_ssts(&self, uuids: Vec>) { + let mut map = self.sst_map.write().unwrap(); + for uuid in uuids { + map.remove(&uuid); + } + } +} + /// A share state between raftstore and underlying engine. /// /// raftstore will update state changes and corresponding apply index, when diff --git a/components/engine_traits/src/misc.rs b/components/engine_traits/src/misc.rs index c2d317f529f..1a05a5de374 100644 --- a/components/engine_traits/src/misc.rs +++ b/components/engine_traits/src/misc.rs @@ -54,6 +54,16 @@ pub trait StatisticsReporter { fn flush(&mut self); } +#[derive(Default)] +pub struct RangeStats { + // The number of entries + pub num_entries: u64, + // The number of MVCC versions of all rows (num_entries - tombstones). + pub num_versions: u64, + // The number of rows. 
+ pub num_rows: u64, +} + pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { type StatisticsReporter: StatisticsReporter; @@ -121,12 +131,7 @@ pub trait MiscExt: CfNamesExt + FlowControlFactorsExt { fn get_num_keys(&self) -> Result; - fn get_range_entries_and_versions( - &self, - cf: &str, - start: &[u8], - end: &[u8], - ) -> Result>; + fn get_range_stats(&self, cf: &str, start: &[u8], end: &[u8]) -> Result>; fn is_stalled_or_stopped(&self) -> bool; } diff --git a/components/engine_traits/src/sst.rs b/components/engine_traits/src/sst.rs index ea08df3bb50..4a728df1e97 100644 --- a/components/engine_traits/src/sst.rs +++ b/components/engine_traits/src/sst.rs @@ -1,10 +1,10 @@ // Copyright 2019 TiKV Project Authors. Licensed under Apache-2.0. -use std::path::PathBuf; +use std::{path::PathBuf, sync::Arc}; use kvproto::import_sstpb::SstMeta; -use crate::{errors::Result, RefIterable}; +use crate::{errors::Result, EncryptionKeyManager, RefIterable}; #[derive(Clone, Debug)] pub struct SstMetaInfo { @@ -22,6 +22,7 @@ pub trait SstExt: Sized { /// SstReader is used to read an SST file. 
pub trait SstReader: RefIterable + Sized { fn open(path: &str) -> Result; + fn open_encrypted(path: &str, mgr: Arc) -> Result; fn verify_checksum(&self) -> Result<()>; } diff --git a/components/error_code/src/storage.rs b/components/error_code/src/storage.rs index e2cf34094c3..8b41e7a797e 100644 --- a/components/error_code/src/storage.rs +++ b/components/error_code/src/storage.rs @@ -43,5 +43,7 @@ define_error_codes!( ASSERTION_FAILED => ("AssertionFailed", "", ""), LOCK_IF_EXISTS_FAILED => ("LockIfExistsFailed", "", ""), + PRIMARY_MISMATCH => ("PrimaryMismatch", "", ""), + UNKNOWN => ("Unknown", "", "") ); diff --git a/components/raftstore-v2/src/batch/store.rs b/components/raftstore-v2/src/batch/store.rs index 1f6245cc010..25fbde2ed27 100644 --- a/components/raftstore-v2/src/batch/store.rs +++ b/components/raftstore-v2/src/batch/store.rs @@ -46,7 +46,7 @@ use tikv_util::{ sys::SysQuota, time::{duration_to_sec, Instant as TiInstant}, timer::SteadyTimer, - worker::{LazyWorker, Scheduler, Worker}, + worker::{Builder, LazyWorker, Scheduler, Worker}, yatp_pool::{DefaultTicker, FuturePool, YatpPoolBuilder}, Either, }; @@ -57,7 +57,7 @@ use crate::{ operation::{SharedReadTablet, MERGE_IN_PROGRESS_PREFIX, MERGE_SOURCE_PREFIX, SPLIT_PREFIX}, raft::Storage, router::{PeerMsg, PeerTick, StoreMsg}, - worker::{pd, tablet}, + worker::{checkpoint, cleanup, pd, tablet}, Error, Result, }; @@ -224,6 +224,16 @@ impl PollHandler>) -> HandleResult { + fail::fail_point!( + "pause_on_peer_collect_message", + fsm.deref_mut().peer().peer_id() == 1, + |_| unreachable!() + ); + fail::fail_point!( + "on_peer_collect_message_2", + fsm.deref_mut().peer().peer_id() == 2, + |_| unreachable!() + ); debug_assert!(self.peer_msg_buf.is_empty()); let batch_size = self.messages_per_tick(); let received_cnt = fsm.recv(&mut self.peer_msg_buf, batch_size); @@ -496,7 +506,9 @@ pub struct Schedulers { pub read: Scheduler>, pub pd: Scheduler, pub tablet: Scheduler>, + pub checkpoint: Scheduler>, pub write: 
WriteSenders, + pub cleanup: Scheduler, // Following is not maintained by raftstore itself. pub split_check: Scheduler, @@ -518,8 +530,10 @@ struct Workers { async_read: Worker, pd: LazyWorker, tablet: Worker, + checkpoint: Worker, async_write: StoreWriters, purge: Option, + cleanup_worker: Worker, // Following is not maintained by raftstore itself. background: Worker, @@ -527,12 +541,15 @@ struct Workers { impl Workers { fn new(background: Worker, pd: LazyWorker, purge: Option) -> Self { + let checkpoint = Builder::new("checkpoint-worker").thread_count(2).create(); Self { async_read: Worker::new("async-read-worker"), pd, tablet: Worker::new("tablet-worker"), + checkpoint, async_write: StoreWriters::new(None), purge, + cleanup_worker: Worker::new("cleanup-worker"), background, } } @@ -542,6 +559,7 @@ impl Workers { self.async_read.stop(); self.pd.stop(); self.tablet.stop(); + self.checkpoint.stop(); if let Some(w) = self.purge { w.stop(); } @@ -653,7 +671,7 @@ impl StoreSystem { ), ); - let tablet_gc_scheduler = workers.tablet.start_with_timer( + let tablet_scheduler = workers.tablet.start_with_timer( "tablet-worker", tablet::Runner::new( tablet_registry.clone(), @@ -662,12 +680,25 @@ impl StoreSystem { ), ); + let compact_runner = + cleanup::CompactRunner::new(tablet_registry.clone(), self.logger.clone()); + let cleanup_worker_scheduler = workers + .cleanup_worker + .start("cleanup-worker", cleanup::Runner::new(compact_runner)); + + let checkpoint_scheduler = workers.checkpoint.start( + "checkpoint-worker", + checkpoint::Runner::new(self.logger.clone(), tablet_registry.clone()), + ); + let schedulers = Schedulers { read: read_scheduler, pd: workers.pd.scheduler(), - tablet: tablet_gc_scheduler, + tablet: tablet_scheduler, + checkpoint: checkpoint_scheduler, write: workers.async_write.senders(), split_check: split_check_scheduler, + cleanup: cleanup_worker_scheduler, }; let builder = StorePollerBuilder::new( diff --git a/components/raftstore-v2/src/fsm/apply.rs 
b/components/raftstore-v2/src/fsm/apply.rs index 08d7f7946ec..ff1c5414de3 100644 --- a/components/raftstore-v2/src/fsm/apply.rs +++ b/components/raftstore-v2/src/fsm/apply.rs @@ -7,7 +7,7 @@ use std::{ use batch_system::{Fsm, FsmScheduler, Mailbox}; use crossbeam::channel::TryRecvError; -use engine_traits::{FlushState, KvEngine, TabletRegistry}; +use engine_traits::{FlushState, KvEngine, SstApplyState, TabletRegistry}; use futures::{compat::Future01CompatExt, FutureExt, StreamExt}; use kvproto::{metapb, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; @@ -27,6 +27,7 @@ use crate::{ operation::{CatchUpLogs, DataTrace}, raft::Apply, router::{ApplyRes, ApplyTask, PeerMsg}, + worker::checkpoint, }; /// A trait for reporting apply result. @@ -77,7 +78,9 @@ impl ApplyFsm { res_reporter: R, tablet_registry: TabletRegistry, read_scheduler: Scheduler>, + checkpoint_scheduler: Scheduler>, flush_state: Arc, + sst_apply_state: SstApplyState, log_recovery: Option>, applied_term: u64, buckets: Option, @@ -94,11 +97,13 @@ impl ApplyFsm { tablet_registry, read_scheduler, flush_state, + sst_apply_state, log_recovery, applied_term, buckets, sst_importer, coprocessor_host, + checkpoint_scheduler, logger, ); ( diff --git a/components/raftstore-v2/src/fsm/store.rs b/components/raftstore-v2/src/fsm/store.rs index e9b224b7375..c7f228f7f9c 100644 --- a/components/raftstore-v2/src/fsm/store.rs +++ b/components/raftstore-v2/src/fsm/store.rs @@ -142,6 +142,7 @@ impl StoreRegionMeta for StoreMeta { pub struct Store { id: u64, + last_compact_checked_key: Vec, // Unix time when it's started. 
start_time: Option, logger: Logger, @@ -151,6 +152,7 @@ impl Store { pub fn new(id: u64, logger: Logger) -> Store { Store { id, + last_compact_checked_key: keys::DATA_MIN_KEY.to_vec(), start_time: None, logger: logger.new(o!("store_id" => id)), } @@ -160,6 +162,14 @@ impl Store { self.id } + pub fn last_compact_checked_key(&self) -> &Vec { + &self.last_compact_checked_key + } + + pub fn set_last_compact_checked_key(&mut self, key: Vec) { + self.last_compact_checked_key = key; + } + pub fn start_time(&self) -> Option { self.start_time } @@ -239,6 +249,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { StoreTick::CleanupImportSst, self.store_ctx.cfg.cleanup_import_sst_interval.0, ); + self.register_compact_check_tick(); } pub fn schedule_tick(&mut self, tick: StoreTick, timeout: Duration) { @@ -263,6 +274,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { match tick { StoreTick::PdStoreHeartbeat => self.on_pd_store_heartbeat(), StoreTick::CleanupImportSst => self.on_cleanup_import_sst(), + StoreTick::CompactCheck => self.on_compact_check_tick(), _ => unimplemented!(), } } diff --git a/components/raftstore-v2/src/lib.rs b/components/raftstore-v2/src/lib.rs index bcfaf383024..697d953e5c8 100644 --- a/components/raftstore-v2/src/lib.rs +++ b/components/raftstore-v2/src/lib.rs @@ -43,6 +43,7 @@ pub use fsm::StoreMeta; pub use operation::{write_initial_states, SimpleWriteBinary, SimpleWriteEncoder, StateStorage}; pub use raftstore::{store::Config, Error, Result}; pub use worker::{ + cleanup::CompactTask, pd::{PdReporter, Task as PdTask}, tablet::Task as TabletTask, }; diff --git a/components/raftstore-v2/src/operation/command/admin/flashback.rs b/components/raftstore-v2/src/operation/command/admin/flashback.rs new file mode 100644 index 00000000000..15d9070de45 --- /dev/null +++ b/components/raftstore-v2/src/operation/command/admin/flashback.rs @@ -0,0 +1,108 @@ +// Copyright 2023 TiKV Project Authors. 
Licensed under Apache-2.0. + +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch}; +use fail::fail_point; +use kvproto::{ + raft_cmdpb::{AdminCmdType, AdminRequest, AdminResponse, RaftCmdRequest}, + raft_serverpb::RegionLocalState, +}; +use protobuf::Message; +use raftstore::{coprocessor::RegionChangeReason, store::metrics::PEER_ADMIN_CMD_COUNTER, Result}; + +use super::AdminCmdResult; +use crate::{ + batch::StoreContext, + fsm::ApplyResReporter, + raft::{Apply, Peer}, +}; + +#[derive(Debug)] +pub struct FlashbackResult { + index: u64, + region_state: RegionLocalState, +} + +impl Peer { + pub fn propose_flashback( + &mut self, + store_ctx: &mut StoreContext, + req: RaftCmdRequest, + ) -> Result { + let data = req.write_to_bytes().unwrap(); + self.propose(store_ctx, data) + } +} + +impl Apply { + pub fn apply_flashback( + &mut self, + index: u64, + req: &AdminRequest, + ) -> Result<(AdminResponse, AdminCmdResult)> { + // Modify flashback fields in region state. + // + // Note: region state is persisted by `Peer::on_apply_res_flashback`. + let region = self.region_state_mut().mut_region(); + match req.get_cmd_type() { + AdminCmdType::PrepareFlashback => { + PEER_ADMIN_CMD_COUNTER.prepare_flashback.success.inc(); + + region.set_is_in_flashback(true); + region.set_flashback_start_ts(req.get_prepare_flashback().get_start_ts()); + } + AdminCmdType::FinishFlashback => { + PEER_ADMIN_CMD_COUNTER.finish_flashback.success.inc(); + + region.set_is_in_flashback(false); + region.clear_flashback_start_ts(); + } + _ => unreachable!(), + } + Ok(( + AdminResponse::default(), + AdminCmdResult::Flashback(FlashbackResult { + index, + region_state: self.region_state().clone(), + }), + )) + } +} + +impl Peer { + // Match v1 on_set_flashback_state. 
+ pub fn on_apply_res_flashback( + &mut self, + store_ctx: &mut StoreContext, + mut res: FlashbackResult, + ) { + (|| { + fail_point!("keep_peer_fsm_flashback_state_false", |_| { + res.region_state.mut_region().set_is_in_flashback(false); + }) + })(); + slog::debug!(self.logger, + "flashback update region"; + "region" => ?res.region_state.get_region()); + let region_id = self.region_id(); + { + let mut meta = store_ctx.store_meta.lock().unwrap(); + meta.set_region(res.region_state.get_region(), true, &self.logger); + let (reader, _) = meta.readers.get_mut(®ion_id).unwrap(); + self.set_region( + &store_ctx.coprocessor_host, + reader, + res.region_state.get_region().clone(), + RegionChangeReason::Flashback, + res.region_state.get_tablet_index(), + ); + } + + self.state_changes_mut() + .put_region_state(region_id, res.index, &res.region_state) + .unwrap(); + self.set_has_extra_write(); + + // Compares to v1, v2 does not expire remote lease, because only + // local reader can serve read requests. 
+ } +} diff --git a/components/raftstore-v2/src/operation/command/admin/mod.rs b/components/raftstore-v2/src/operation/command/admin/mod.rs index 69c9b39aaa2..9f3475a25d3 100644 --- a/components/raftstore-v2/src/operation/command/admin/mod.rs +++ b/components/raftstore-v2/src/operation/command/admin/mod.rs @@ -2,6 +2,7 @@ mod compact_log; mod conf_change; +mod flashback; mod merge; mod split; mod transfer_leader; @@ -39,6 +40,7 @@ pub use split::{ use tikv_util::{box_err, log::SlogFormat}; use txn_types::WriteBatchFlags; +use self::flashback::FlashbackResult; use crate::{ batch::StoreContext, raft::Peer, @@ -56,6 +58,7 @@ pub enum AdminCmdResult { UpdateGcPeers(UpdateGcPeersResult), PrepareMerge(PrepareMergeResult), CommitMerge(CommitMergeResult), + Flashback(FlashbackResult), } impl Peer { @@ -264,7 +267,10 @@ impl Peer { } AdminCmdType::PrepareMerge => self.propose_prepare_merge(ctx, req), AdminCmdType::CommitMerge => self.propose_commit_merge(ctx, req), - _ => unimplemented!(), + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { + self.propose_flashback(ctx, req) + } + _ => unimplemented!("{:?}", req), } }; match &res { diff --git a/components/raftstore-v2/src/operation/command/admin/split.rs b/components/raftstore-v2/src/operation/command/admin/split.rs index 4c6fdad3aa2..9dbd27f336d 100644 --- a/components/raftstore-v2/src/operation/command/admin/split.rs +++ b/components/raftstore-v2/src/operation/command/admin/split.rs @@ -25,14 +25,13 @@ //! created by the store, and here init it using the data sent from the parent //! peer. 
-use std::{any::Any, borrow::Cow, cmp, path::PathBuf}; +use std::{any::Any, borrow::Cow, cmp, path::PathBuf, time::Duration}; use collections::HashSet; use crossbeam::channel::SendError; -use engine_traits::{ - Checkpointer, KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, -}; +use engine_traits::{KvEngine, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry}; use fail::fail_point; +use futures::channel::oneshot; use kvproto::{ metapb::{self, Region, RegionEpoch}, pdpb::CheckPolicy, @@ -54,7 +53,7 @@ use raftstore::{ Result, }; use slog::{error, info, warn}; -use tikv_util::{log::SlogFormat, slog_panic, time::Instant}; +use tikv_util::{log::SlogFormat, slog_panic, time::Instant, worker::Scheduler}; use crate::{ batch::StoreContext, @@ -62,7 +61,7 @@ use crate::{ operation::{AdminCmdResult, SharedReadTablet}, raft::{Apply, Peer}, router::{CmdResChannel, PeerMsg, PeerTick, StoreMsg}, - worker::tablet, + worker::{checkpoint, tablet}, Error, }; @@ -370,7 +369,7 @@ impl Peer { } impl Apply { - pub fn apply_split( + pub async fn apply_split( &mut self, req: &AdminRequest, log_index: u64, @@ -388,10 +387,10 @@ impl Apply { // This method is executed only when there are unapplied entries after being // restarted. So there will be no callback, it's OK to return a response // that does not matched with its request. 
- self.apply_batch_split(req, log_index) + self.apply_batch_split(req, log_index).await } - pub fn apply_batch_split( + pub async fn apply_batch_split( &mut self, req: &AdminRequest, log_index: u64, @@ -401,6 +400,11 @@ impl Apply { self.peer().get_store_id() == 3, |_| { unreachable!() } ); + fail_point!( + "apply_before_split_1_3", + self.peer_id() == 3 && self.region_id() == 1, + |_| { unreachable!() } + ); PEER_ADMIN_CMD_COUNTER.batch_split.all.inc(); let region = self.region(); @@ -469,65 +473,27 @@ impl Apply { // write batch self.flush(); - // todo(SpadeA): Here: we use a temporary solution that we use checkpoint API to - // clone new tablets. It may cause large jitter as we need to flush the - // memtable. And more what is more important is that after removing WAL, the API - // will never flush. - // We will freeze the memtable rather than flush it in the following PR. - let tablet = self.tablet().clone(); - let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { - slog_panic!( - self.logger, - "fails to create checkpoint object"; - "error" => ?e - ) - }); - let now = Instant::now(); - let reg = self.tablet_registry(); - for new_region in ®ions { - let new_region_id = new_region.id; - if new_region_id == region_id { - continue; - } - - let split_temp_path = temp_split_path(reg, new_region_id); - checkpointer - .create_at(&split_temp_path, None, 0) - .unwrap_or_else(|e| { - slog_panic!( - self.logger, - "fails to create checkpoint"; - "path" => %split_temp_path.display(), - "error" => ?e - ) - }); - } + let split_region_ids = regions + .iter() + .map(|r| r.get_id()) + .filter(|id| id != ®ion_id) + .collect::>(); + let scheduler: _ = self.checkpoint_scheduler().clone(); + let tablet = self.tablet().clone(); + let checkpoint_duration = + async_checkpoint(tablet, &scheduler, region_id, split_region_ids, log_index).await; - let derived_path = self.tablet_registry().tablet_path(region_id, log_index); - // If it's recovered from restart, it's possible 
the target path exists already. - // And because checkpoint is atomic, so we don't need to worry about corruption. - // And it's also wrong to delete it and remake as it may has applied and flushed - // some data to the new checkpoint before being restarted. - if !derived_path.exists() { - checkpointer - .create_at(&derived_path, None, 0) - .unwrap_or_else(|e| { - slog_panic!( - self.logger, - "fails to create checkpoint"; - "path" => %derived_path.display(), - "error" => ?e - ) - }); - } + // It should equal to checkpoint_duration + the duration of rescheduling current + // apply peer let elapsed = now.saturating_elapsed(); // to be removed after when it's stable info!( self.logger, - "create checkpoint time consumes"; + "checkpoint done and resume batch split execution"; "region" => ?self.region(), - "duration" => ?elapsed + "checkpoint_duration" => ?checkpoint_duration, + "total_duration" => ?elapsed, ); let reg = self.tablet_registry(); @@ -560,6 +526,27 @@ impl Apply { } } +// asynchronously execute the checkpoint creation and return the duration spent +// by it +async fn async_checkpoint( + tablet: EK, + scheduler: &Scheduler>, + parent_region: u64, + split_regions: Vec, + log_index: u64, +) -> Duration { + let (tx, rx) = oneshot::channel(); + let task = checkpoint::Task::Checkpoint { + tablet, + log_index, + parent_region, + split_regions, + sender: tx, + }; + scheduler.schedule_force(task).unwrap(); + rx.await.unwrap() +} + impl Peer { pub fn on_apply_res_split( &mut self, @@ -865,8 +852,10 @@ mod test { kv::{KvTestEngine, TestTabletFactory}, }; use engine_traits::{ - FlushState, Peekable, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, DATA_CFS, + FlushState, Peekable, SstApplyState, TabletContext, TabletRegistry, WriteBatch, CF_DEFAULT, + DATA_CFS, }; + use futures::executor::block_on; use kvproto::{ metapb::RegionEpoch, raft_cmdpb::{BatchSplitRequest, SplitRequest}, @@ -879,8 +868,9 @@ mod test { use slog::o; use tempfile::TempDir; use 
tikv_util::{ + defer, store::{new_learner_peer, new_peer}, - worker::dummy_scheduler, + worker::{dummy_scheduler, Worker}, }; use super::*; @@ -947,7 +937,8 @@ mod test { req.set_splits(splits); // Exec batch split - let (resp, apply_res) = apply.apply_batch_split(&req, log_index).unwrap(); + let (resp, apply_res) = + block_on(async { apply.apply_batch_split(&req, log_index).await }).unwrap(); let regions = resp.get_splits().get_regions(); assert!(regions.len() == region_boundries.len()); @@ -990,6 +981,11 @@ mod test { assert!(reg.tablet_factory().exists(&path)); } } + + let AdminCmdResult::SplitRegion(SplitResult { tablet, .. }) = apply_res else { panic!() }; + // update cache + let mut cache = apply.tablet_registry().get(parent_id).unwrap(); + cache.set(*tablet.downcast().unwrap()); } #[test] @@ -1020,6 +1016,13 @@ mod test { region_state.set_region(region.clone()); region_state.set_tablet_index(5); + let checkpoint_worker = Worker::new("checkpoint-worker"); + let checkpoint_scheduler = checkpoint_worker.start( + "checkpoint-worker", + checkpoint::Runner::new(logger.clone(), reg.clone()), + ); + defer!(checkpoint_worker.stop()); + let (read_scheduler, _rx) = dummy_scheduler(); let (reporter, _) = MockReporter::new(); let (_tmp_dir, importer) = create_tmp_importer(); @@ -1037,11 +1040,13 @@ mod test { reg, read_scheduler, Arc::new(FlushState::new(5)), + SstApplyState::default(), None, 5, None, importer, host, + checkpoint_scheduler, logger.clone(), ); @@ -1050,13 +1055,13 @@ mod test { splits.mut_requests().push(new_split_req(b"k1", 1, vec![])); let mut req = AdminRequest::default(); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 0).unwrap_err(); + let err = block_on(async { apply.apply_batch_split(&req, 0).await }).unwrap_err(); // 3 followers are required. 
assert!(err.to_string().contains("invalid new peer id count")); splits.mut_requests().clear(); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 6).unwrap_err(); + let err = block_on(async { apply.apply_batch_split(&req, 6).await }).unwrap_err(); // Empty requests should be rejected. assert!(err.to_string().contains("missing split requests")); @@ -1064,7 +1069,9 @@ mod test { .mut_requests() .push(new_split_req(b"k11", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let resp = new_error(apply.apply_batch_split(&req, 0).unwrap_err()); + let resp = + new_error(block_on(async { apply.apply_batch_split(&req, 0).await }).unwrap_err()); + // Out of range keys should be rejected. assert!( resp.get_header().get_error().has_key_not_in_region(), @@ -1077,7 +1084,7 @@ mod test { .mut_requests() .push(new_split_req(b"", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 7).unwrap_err(); + let err = block_on(async { apply.apply_batch_split(&req, 7).await }).unwrap_err(); // Empty key will not in any region exclusively. assert!(err.to_string().contains("missing split key"), "{:?}", err); @@ -1089,7 +1096,7 @@ mod test { .mut_requests() .push(new_split_req(b"k1", 1, vec![11, 12, 13])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 8).unwrap_err(); + let err = block_on(async { apply.apply_batch_split(&req, 8).await }).unwrap_err(); // keys should be in ascend order. assert!( err.to_string().contains("invalid split request"), @@ -1105,7 +1112,7 @@ mod test { .mut_requests() .push(new_split_req(b"k2", 1, vec![11, 12])); req.set_splits(splits.clone()); - let err = apply.apply_batch_split(&req, 9).unwrap_err(); + let err = block_on(async { apply.apply_batch_split(&req, 9).await }).unwrap_err(); // All requests should be checked. 
assert!(err.to_string().contains("id count"), "{:?}", err); @@ -1223,7 +1230,7 @@ mod test { .mut_requests() .push(new_split_req(b"k05", 70, vec![71, 72, 73])); req.set_splits(splits); - apply.apply_batch_split(&req, 51).unwrap(); + block_on(async { apply.apply_batch_split(&req, 51).await }).unwrap(); assert!(apply.write_batch.is_none()); assert_eq!( apply diff --git a/components/raftstore-v2/src/operation/command/control.rs b/components/raftstore-v2/src/operation/command/control.rs index 586d9f5c019..f05c9ca5297 100644 --- a/components/raftstore-v2/src/operation/command/control.rs +++ b/components/raftstore-v2/src/operation/command/control.rs @@ -82,6 +82,7 @@ pub struct ProposalControl { // should be empty or 1 element. And access speed is not a concern. proposed_admin_cmd: LinkedList, has_pending_prepare_merge: bool, + // Commit index of prepare merge. applied_prepare_merge_index: u64, term: u64, } diff --git a/components/raftstore-v2/src/operation/command/mod.rs b/components/raftstore-v2/src/operation/command/mod.rs index 2f2df5a0333..e68449e8026 100644 --- a/components/raftstore-v2/src/operation/command/mod.rs +++ b/components/raftstore-v2/src/operation/command/mod.rs @@ -37,9 +37,12 @@ use raftstore::{ Proposal, }, local_metrics::RaftMetrics, - metrics::{APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM}, + metrics::{ + APPLY_TASK_WAIT_TIME_HISTOGRAM, APPLY_TIME_HISTOGRAM, STORE_APPLY_LOG_HISTOGRAM, + }, msg::ErrorCallback, - util, Config, Transport, WriteCallback, + util::{self, check_flashback_state}, + Config, Transport, WriteCallback, }, Error, Result, }; @@ -100,7 +103,6 @@ pub struct CommittedEntries { /// Entries need to be applied. Note some entries may not be included for /// flow control. 
pub entry_and_proposals: Vec<(Entry, Vec)>, - pub committed_time: Instant, } fn new_response(header: &RaftRequestHeader) -> RaftCmdResponse { @@ -135,6 +137,7 @@ impl Peer { let logger = self.logger.clone(); let read_scheduler = self.storage().read_scheduler(); let buckets = self.region_buckets_info().bucket_stat().clone(); + let sst_apply_state = self.sst_apply_state().clone(); let (apply_scheduler, mut apply_fsm) = ApplyFsm::new( &store_ctx.cfg, self.peer().clone(), @@ -142,7 +145,9 @@ impl Peer { mailbox, store_ctx.tablet_registry.clone(), read_scheduler, + store_ctx.schedulers.checkpoint.clone(), self.flush_state().clone(), + sst_apply_state, self.storage().apply_trace().log_recovery(), self.entry_storage().applied_term(), buckets, @@ -189,6 +194,29 @@ impl Peer { } return Err(e); } + // Check whether the region is in the flashback state and the request could be + // proposed. Skip the not prepared error because the + // `self.region().is_in_flashback` may not be the latest right after applying + // the `PrepareFlashback` admin command, we will let it pass here and check in + // the apply phase and because a read-only request doesn't need to be applied, + // so it will be allowed during the flashback progress, for example, a snapshot + // request. + if let Err(e) = util::check_flashback_state( + self.region().get_is_in_flashback(), + self.region().get_flashback_start_ts(), + header, + admin_type, + self.region_id(), + true, + ) { + match e { + Error::FlashbackInProgress(..) => { + metrics.invalid_proposal.flashback_in_progress.inc() + } + _ => unreachable!("{:?}", e), + } + return Err(e); + } Ok(()) } @@ -306,7 +334,6 @@ impl Peer { // memtables in kv engine is flushed. 
let apply = CommittedEntries { entry_and_proposals, - committed_time: Instant::now(), }; assert!( self.apply_scheduler().is_some() || ctx.router.is_shutdown(), @@ -369,6 +396,7 @@ impl Peer { AdminCmdResult::UpdateGcPeers(state) => self.on_apply_res_update_gc_peers(state), AdminCmdResult::PrepareMerge(res) => self.on_apply_res_prepare_merge(ctx, res), AdminCmdResult::CommitMerge(res) => self.on_apply_res_commit_merge(ctx, res), + AdminCmdResult::Flashback(res) => self.on_apply_res_flashback(ctx, res), } } self.region_buckets_info_mut() @@ -485,6 +513,7 @@ impl Apply { dr.start_key, dr.end_key, dr.notify_only, + self.use_delete_range(), ); } SimpleWrite::Ingest(_) => { @@ -517,14 +546,17 @@ impl Apply { #[inline] pub async fn apply_committed_entries(&mut self, ce: CommittedEntries) { fail::fail_point!("APPLY_COMMITTED_ENTRIES"); - APPLY_TASK_WAIT_TIME_HISTOGRAM - .observe(duration_to_sec(ce.committed_time.saturating_elapsed())); + let now = std::time::Instant::now(); + let apply_wait_time = APPLY_TASK_WAIT_TIME_HISTOGRAM.local(); for (e, ch) in ce.entry_and_proposals { if self.tombstone() { apply::notify_req_region_removed(self.region_id(), ch); continue; } if !e.get_data().is_empty() { + for tracker in ch.write_trackers() { + tracker.observe(now, &apply_wait_time, |t| &mut t.metrics.apply_wait_nanos); + } let mut set_save_point = false; if let Some(wb) = &mut self.write_batch { wb.set_save_point(); @@ -567,6 +599,13 @@ impl Apply { entry.get_term(), ) { Ok(decoder) => { + fail::fail_point!( + "on_apply_write_cmd", + cfg!(release) || self.peer_id() == 3, + |_| { + unimplemented!(); + } + ); util::compare_region_epoch( decoder.header().get_region_epoch(), self.region(), @@ -594,6 +633,7 @@ impl Apply { dr.start_key, dr.end_key, dr.notify_only, + self.use_delete_range(), )?; } SimpleWrite::Ingest(ssts) => { @@ -624,12 +664,22 @@ impl Apply { }; util::check_req_region_epoch(&req, self.region(), true)?; + let header = req.get_header(); + let admin_type = 
req.admin_request.as_ref().map(|req| req.get_cmd_type()); + check_flashback_state( + self.region().get_is_in_flashback(), + self.region().get_flashback_start_ts(), + header, + admin_type, + self.region_id(), + false, + )?; if req.has_admin_request() { let admin_req = req.get_admin_request(); let (admin_resp, admin_result) = match req.get_admin_request().get_cmd_type() { AdminCmdType::CompactLog => self.apply_compact_log(admin_req, log_index)?, - AdminCmdType::Split => self.apply_split(admin_req, log_index)?, - AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, log_index)?, + AdminCmdType::Split => self.apply_split(admin_req, log_index).await?, + AdminCmdType::BatchSplit => self.apply_batch_split(admin_req, log_index).await?, AdminCmdType::PrepareMerge => self.apply_prepare_merge(admin_req, log_index)?, AdminCmdType::CommitMerge => self.apply_commit_merge(admin_req, log_index).await?, AdminCmdType::RollbackMerge => unimplemented!(), @@ -644,8 +694,9 @@ impl Apply { } AdminCmdType::ComputeHash => unimplemented!(), AdminCmdType::VerifyHash => unimplemented!(), - AdminCmdType::PrepareFlashback => unimplemented!(), - AdminCmdType::FinishFlashback => unimplemented!(), + AdminCmdType::PrepareFlashback | AdminCmdType::FinishFlashback => { + self.apply_flashback(log_index, admin_req)? + } AdminCmdType::BatchSwitchWitness => unimplemented!(), AdminCmdType::UpdateGcPeer => self.apply_update_gc_peer(log_index, admin_req), AdminCmdType::InvalidAdmin => { @@ -681,6 +732,7 @@ impl Apply { dr.get_start_key(), dr.get_end_key(), dr.get_notify_only(), + self.use_delete_range(), )?; } _ => unimplemented!(), @@ -787,7 +839,14 @@ impl Apply { let apply_time = APPLY_TIME_HISTOGRAM.local(); for (ch, resp) in callbacks.drain(..) 
{ for tracker in ch.write_trackers() { - tracker.observe(now, &apply_time, |t| &mut t.metrics.apply_time_nanos); + let mut apply_wait_nanos = 0_u64; + let apply_time_nanos = tracker.observe(now, &apply_time, |t| { + apply_wait_nanos = t.metrics.apply_wait_nanos; + &mut t.metrics.apply_time_nanos + }); + STORE_APPLY_LOG_HISTOGRAM.observe(duration_to_sec(Duration::from_nanos( + apply_time_nanos - apply_wait_nanos, + ))); } ch.set_result(resp); } diff --git a/components/raftstore-v2/src/operation/command/write/ingest.rs b/components/raftstore-v2/src/operation/command/write/ingest.rs index bc15765437f..90382de24aa 100644 --- a/components/raftstore-v2/src/operation/command/write/ingest.rs +++ b/components/raftstore-v2/src/operation/command/write/ingest.rs @@ -67,12 +67,34 @@ impl Peer { ctx: &mut StoreContext, ssts: Box<[SstMeta]>, ) { - let epoch = self.region().get_region_epoch(); let mut stale_ssts = Vec::from(ssts); - stale_ssts.retain(|sst| util::is_epoch_stale(sst.get_region_epoch(), epoch)); + let epoch = self.region().get_region_epoch(); + stale_ssts.retain(|sst| { + fail::fail_point!("on_cleanup_import_sst", |_| true); + util::is_epoch_stale(sst.get_region_epoch(), epoch) + }); + + // some sst needs to be kept if the log didn't flush the disk. 
+ let flushed_indexes = self.storage().apply_trace().flushed_indexes(); + stale_ssts.retain(|sst| { + let off = data_cf_offset(sst.get_cf_name()); + let uuid = sst.get_uuid().to_vec(); + let sst_index = self.sst_apply_state().sst_applied_index(&uuid); + if let Some(index) = sst_index { + return flushed_indexes.as_ref()[off] >= index; + } + true + }); + + fail::fail_point!("on_cleanup_import_sst_schedule"); if stale_ssts.is_empty() { return; } + let uuids = stale_ssts + .iter() + .map(|sst| sst.get_uuid().to_vec()) + .collect(); + self.sst_apply_state().delete_ssts(uuids); let _ = ctx .schedulers .tablet @@ -116,6 +138,11 @@ impl Apply { slog_panic!(self.logger, "ingest fail"; "ssts" => ?ssts, "error" => ?e); } } + let uuids = infos + .iter() + .map(|info| info.meta.get_uuid().to_vec()) + .collect::>(); + self.set_sst_applied_index(uuids, index); Ok(()) } } diff --git a/components/raftstore-v2/src/operation/command/write/mod.rs b/components/raftstore-v2/src/operation/command/write/mod.rs index 9f4afec9ad6..a12d3e68f45 100644 --- a/components/raftstore-v2/src/operation/command/write/mod.rs +++ b/components/raftstore-v2/src/operation/command/write/mod.rs @@ -1,6 +1,9 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. 
-use engine_traits::{data_cf_offset, KvEngine, Mutable, RaftEngine, CF_DEFAULT}; +use engine_traits::{ + data_cf_offset, DeleteStrategy, KvEngine, Mutable, RaftEngine, Range as EngineRange, ALL_CFS, + CF_DEFAULT, +}; use kvproto::raft_cmdpb::RaftRequestHeader; use raftstore::{ store::{ @@ -12,7 +15,8 @@ use raftstore::{ }, Error, Result, }; -use tikv_util::slog_panic; +use slog::info; +use tikv_util::{box_err, slog_panic}; use crate::{ batch::StoreContext, @@ -222,13 +226,94 @@ impl Apply { #[inline] pub fn apply_delete_range( &mut self, - _cf: &str, - _index: u64, - _start_key: &[u8], - _end_key: &[u8], - _notify_only: bool, + mut cf: &str, + index: u64, + start_key: &[u8], + end_key: &[u8], + notify_only: bool, + use_delete_range: bool, ) -> Result<()> { - // TODO: reuse the same delete as split/merge. + PEER_WRITE_CMD_COUNTER.delete_range.inc(); + let off = data_cf_offset(cf); + if self.should_skip(off, index) { + return Ok(()); + } + if !end_key.is_empty() && start_key >= end_key { + return Err(box_err!( + "invalid delete range command, start_key: {:?}, end_key: {:?}", + start_key, + end_key + )); + } + // region key range has no data prefix, so we must use origin key to check. 
+ util::check_key_in_region(start_key, self.region())?; + let end_key = keys::data_end_key(end_key); + let region_end_key = keys::data_end_key(self.region().get_end_key()); + if end_key > region_end_key { + return Err(Error::KeyNotInRegion( + end_key.to_vec(), + self.region().clone(), + )); + } + + if cf.is_empty() { + cf = CF_DEFAULT; + } + + if !ALL_CFS.iter().any(|x| *x == cf) { + return Err(box_err!("invalid delete range command, cf: {:?}", cf)); + } + + let start_key = keys::data_key(start_key); + + info!( + self.logger, + "execute delete range"; + "range_start" => log_wrappers::Value::key(&start_key), + "range_end" => log_wrappers::Value::key(&end_key), + "notify_only" => notify_only, + "use_delete_range" => use_delete_range, + ); + + // Use delete_files_in_range to drop as many sst files as possible, this + // is a way to reclaim disk space quickly after drop a table/index. + if !notify_only { + let range = vec![EngineRange::new(&start_key, &end_key)]; + let fail_f = |e: engine_traits::Error, strategy: DeleteStrategy| { + slog_panic!( + self.logger, + "failed to delete"; + "strategy" => ?strategy, + "range_start" => log_wrappers::Value::key(&start_key), + "range_end" => log_wrappers::Value::key(&end_key), + "error" => ?e, + ) + }; + let tablet = self.tablet(); + tablet + .delete_ranges_cf(cf, DeleteStrategy::DeleteFiles, &range) + .unwrap_or_else(|e| fail_f(e, DeleteStrategy::DeleteFiles)); + + let strategy = if use_delete_range { + DeleteStrategy::DeleteByRange + } else { + DeleteStrategy::DeleteByKey + }; + // Delete all remaining keys. + tablet + .delete_ranges_cf(cf, strategy.clone(), &range) + .unwrap_or_else(move |e| fail_f(e, strategy)); + + // to do: support titan? 
+ // tablet + // .delete_ranges_cf(cf, DeleteStrategy::DeleteBlobs, &range) + // .unwrap_or_else(move |e| fail_f(e, + // DeleteStrategy::DeleteBlobs)); + } + if index != u64::MAX { + self.modifications_mut()[off] = index; + } + Ok(()) } } diff --git a/components/raftstore-v2/src/operation/misc.rs b/components/raftstore-v2/src/operation/misc.rs new file mode 100644 index 00000000000..c2c3d643965 --- /dev/null +++ b/components/raftstore-v2/src/operation/misc.rs @@ -0,0 +1,106 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::collections::{ + Bound::{Excluded, Unbounded}, + HashSet, +}; + +use engine_traits::{KvEngine, RaftEngine, CF_DEFAULT, CF_WRITE}; +use slog::{debug, error, info}; + +use crate::{ + fsm::StoreFsmDelegate, + router::StoreTick, + worker::cleanup::{self, CompactThreshold}, + CompactTask::CheckAndCompact, +}; + +impl<'a, EK: KvEngine, ER: RaftEngine, T> StoreFsmDelegate<'a, EK, ER, T> { + pub fn register_compact_check_tick(&mut self) { + self.schedule_tick( + StoreTick::CompactCheck, + self.store_ctx.cfg.region_compact_check_interval.0, + ) + } + + pub fn on_compact_check_tick(&mut self) { + self.register_compact_check_tick(); + if self.store_ctx.schedulers.cleanup.is_busy() { + info!( + self.store_ctx.logger, + "compact worker is busy, check space redundancy next time"; + ); + return; + } + + // Use HashSet here as the region end_keys in store_meta is not unique. + let mut regions_to_check: HashSet = HashSet::default(); + + let (largest_end_key, last_check_key) = { + // Start from last checked key. + let mut last_check_key = self.fsm.store.last_compact_checked_key(); + + let meta = self.store_ctx.store_meta.lock().unwrap(); + if meta.region_ranges.is_empty() { + debug!( + self.store_ctx.logger, + "there is no range need to check"; + ); + return; + } + // Collect continuous ranges. 
+ let ranges = meta.region_ranges.range(( + Excluded((last_check_key.clone(), u64::MAX)), + Unbounded::<(Vec, u64)>, + )); + + for region_range in ranges { + last_check_key = ®ion_range.0.0; + regions_to_check.insert(*region_range.1); + + if regions_to_check.len() >= self.store_ctx.cfg.region_compact_check_step() as usize + { + break; + } + } + + ( + meta.region_ranges.keys().last().unwrap().0.to_vec(), + last_check_key.clone(), + ) + }; + + if largest_end_key == last_check_key { + // Next task will start from the very beginning. + self.fsm + .store + .set_last_compact_checked_key(keys::DATA_MIN_KEY.to_vec()); + } else { + self.fsm.store.set_last_compact_checked_key(last_check_key); + } + + // Schedule the task. + let cf_names = vec![CF_DEFAULT.to_owned(), CF_WRITE.to_owned()]; + if let Err(e) = self + .store_ctx + .schedulers + .cleanup + .schedule(cleanup::Task::Compact(CheckAndCompact { + cf_names, + region_ids: regions_to_check.into_iter().collect::>(), + compact_threshold: CompactThreshold::new( + self.store_ctx.cfg.region_compact_min_tombstones, + self.store_ctx.cfg.region_compact_tombstones_percent, + self.store_ctx.cfg.region_compact_min_redundant_rows, + self.store_ctx.cfg.region_compact_redundant_rows_percent, + ), + })) + { + error!( + self.store_ctx.logger, + "schedule space check task failed"; + "err" => ?e, + ); + } + } +} diff --git a/components/raftstore-v2/src/operation/mod.rs b/components/raftstore-v2/src/operation/mod.rs index f5eb4ebdb6f..e0107122da9 100644 --- a/components/raftstore-v2/src/operation/mod.rs +++ b/components/raftstore-v2/src/operation/mod.rs @@ -3,6 +3,7 @@ mod bucket; mod command; mod life; +mod misc; mod pd; mod query; mod ready; diff --git a/components/raftstore-v2/src/operation/pd.rs b/components/raftstore-v2/src/operation/pd.rs index f45cae390da..7ad82959fa8 100644 --- a/components/raftstore-v2/src/operation/pd.rs +++ b/components/raftstore-v2/src/operation/pd.rs @@ -46,8 +46,8 @@ impl Store { let snap_stats = 
ctx.snap_mgr.stats(); // todo: imple snapshot status report - stats.set_sending_snap_count(0); - stats.set_receiving_snap_count(0); + stats.set_sending_snap_count(snap_stats.sending_count as u32); + stats.set_receiving_snap_count(snap_stats.receiving_count as u32); stats.set_snapshot_stats(snap_stats.stats.into()); STORE_SNAPSHOT_TRAFFIC_GAUGE_VEC diff --git a/components/raftstore-v2/src/operation/query/capture.rs b/components/raftstore-v2/src/operation/query/capture.rs index 5fdbde187e4..9debb8e0364 100644 --- a/components/raftstore-v2/src/operation/query/capture.rs +++ b/components/raftstore-v2/src/operation/query/capture.rs @@ -19,6 +19,7 @@ use raftstore::{ }, }; use slog::info; +use txn_types::WriteBatchFlags; use crate::{ fsm::{ApplyResReporter, PeerFsmDelegate}, @@ -31,20 +32,26 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: raftstore::store::Transport> { pub fn on_leader_callback(&mut self, ch: QueryResChannel) { let peer = self.fsm.peer(); - let msg = new_read_index_request( + let mut msg = new_read_index_request( peer.region_id(), peer.region().get_region_epoch().clone(), peer.peer().clone(), ); + + // Allow to capture change even is in flashback state. + // TODO: add a test case for this kind of situation. + if self.fsm.peer().region().get_is_in_flashback() { + let mut flags = WriteBatchFlags::from_bits_check(msg.get_header().get_flags()); + flags.insert(WriteBatchFlags::FLASHBACK); + msg.mut_header().set_flags(flags.bits()); + } + self.on_query(msg, ch); } pub fn on_capture_change(&mut self, capture_change: CaptureChange) { fail_point!("raft_on_capture_change"); - // TODO: Allow to capture change even is in flashback state. - // TODO: add a test case for this kind of situation. 
- let apply_router = self.fsm.peer().apply_scheduler().unwrap().clone(); let (ch, _) = QueryResChannel::with_callback(Box::new(move |res| { if let QueryResult::Response(resp) = res && resp.get_header().has_error() { @@ -175,7 +182,7 @@ mod test { kv::{KvTestEngine, TestTabletFactory}, }; use engine_traits::{ - FlushState, Peekable, TabletContext, TabletRegistry, CF_DEFAULT, DATA_CFS, + FlushState, Peekable, SstApplyState, TabletContext, TabletRegistry, CF_DEFAULT, DATA_CFS, }; use futures::executor::block_on; use kvproto::{ @@ -193,7 +200,7 @@ mod test { }; use slog::o; use tempfile::TempDir; - use tikv_util::{store::new_peer, time::Instant, worker::dummy_scheduler}; + use tikv_util::{store::new_peer, worker::dummy_scheduler}; use super::*; use crate::{ @@ -309,6 +316,8 @@ mod test { let mut host = CoprocessorHost::::default(); host.registry .register_cmd_observer(0, BoxCmdObserver::new(ob)); + + let (dummy_scheduler, _) = dummy_scheduler(); let mut apply = Apply::new( &Config::default(), region @@ -322,11 +331,13 @@ mod test { reg, read_scheduler, Arc::new(FlushState::new(5)), + SstApplyState::default(), None, 5, None, importer, host, + dummy_scheduler, logger.clone(), ); @@ -357,7 +368,6 @@ mod test { ), vec![], )], - committed_time: Instant::now(), }), ApplyTask::CaptureApply(CaptureChange { observer: ChangeObserver::from_cdc(region.id, ObserveHandle::new()), @@ -376,7 +386,6 @@ mod test { ), vec![], )], - committed_time: Instant::now(), }), ]; diff --git a/components/raftstore-v2/src/operation/query/lease.rs b/components/raftstore-v2/src/operation/query/lease.rs index 3185f1bd24b..f76d724f06c 100644 --- a/components/raftstore-v2/src/operation/query/lease.rs +++ b/components/raftstore-v2/src/operation/query/lease.rs @@ -4,10 +4,17 @@ use std::sync::Mutex; use engine_traits::{KvEngine, RaftEngine}; use kvproto::raft_cmdpb::RaftCmdRequest; -use raftstore::store::{ - can_amend_read, fsm::apply::notify_stale_req, metrics::RAFT_READ_INDEX_PENDING_COUNT, - 
msg::ReadCallback, propose_read_index, should_renew_lease, util::LeaseState, ReadDelegate, - ReadIndexRequest, ReadProgress, Transport, +use raft::{ + eraftpb::{self, MessageType}, + Storage, +}; +use raftstore::{ + store::{ + can_amend_read, fsm::apply::notify_stale_req, metrics::RAFT_READ_INDEX_PENDING_COUNT, + msg::ReadCallback, propose_read_index, should_renew_lease, util::LeaseState, ReadDelegate, + ReadIndexRequest, ReadProgress, Transport, + }, + Error, Result, }; use slog::debug; use tikv_util::time::monotonic_raw_now; @@ -22,6 +29,72 @@ use crate::{ }; impl Peer { + pub fn on_step_read_index( + &mut self, + ctx: &mut StoreContext, + m: &mut eraftpb::Message, + ) -> bool { + assert_eq!(m.get_msg_type(), MessageType::MsgReadIndex); + + fail::fail_point!("on_step_read_index_msg"); + ctx.coprocessor_host + .on_step_read_index(m, self.state_role()); + // Must use the commit index of `PeerStorage` instead of the commit index + // in raft-rs which may be greater than the former one. + // For more details, see the annotations above `on_leader_commit_idx_changed`. + let index = self.storage().entry_storage().commit_index(); + // Check if the log term of this index is equal to current term, if so, + // this index can be used to reply the read index request if the leader holds + // the lease. Please also take a look at raft-rs. + if self.storage().term(index).unwrap() == self.term() { + let state = self.inspect_lease(); + if let LeaseState::Valid = state { + // If current peer has valid lease, then we could handle the + // request directly, rather than send a heartbeat to check quorum. 
+ let mut resp = eraftpb::Message::default(); + resp.set_msg_type(MessageType::MsgReadIndexResp); + resp.term = self.term(); + resp.to = m.from; + + resp.index = index; + resp.set_entries(m.take_entries()); + + self.raft_group_mut().raft.msgs.push(resp); + return true; + } + } + false + } + + pub fn pre_read_index(&self) -> Result<()> { + fail::fail_point!("before_propose_readindex", |s| if s + .map_or(true, |s| s.parse().unwrap_or(true)) + { + Ok(()) + } else { + Err(tikv_util::box_err!( + "[{}] {} can not read due to injected failure", + self.region_id(), + self.peer_id() + )) + }); + + // See more in ready_to_handle_read(). + if self.proposal_control().is_splitting() { + return Err(Error::ReadIndexNotReady { + reason: "can not read index due to split", + region_id: self.region_id(), + }); + } + if self.proposal_control().is_merging() { + return Err(Error::ReadIndexNotReady { + reason: "can not read index due to merge", + region_id: self.region_id(), + }); + } + Ok(()) + } + pub(crate) fn read_index_leader( &mut self, ctx: &mut StoreContext, @@ -56,7 +129,7 @@ impl Peer { .get_mut(0) .filter(|req| req.has_read_index()) .map(|req| req.take_read_index()); - let (id, dropped) = propose_read_index(self.raft_group_mut(), request.as_ref(), None); + let (id, dropped) = propose_read_index(self.raft_group_mut(), request.as_ref()); if dropped { // The message gets dropped silently, can't be handled anymore. notify_stale_req(self.term(), ch); @@ -166,7 +239,7 @@ impl Peer { let term = self.term(); self.leader_lease_mut() .maybe_new_remote_lease(term) - .map(ReadProgress::leader_lease) + .map(ReadProgress::set_leader_lease) }; if let Some(progress) = progress { let mut meta = store_meta.lock().unwrap(); @@ -180,6 +253,15 @@ impl Peer { } } + // Expire lease and unset lease in read delegate on role changed to follower. 
+ pub(crate) fn expire_lease_on_became_follower(&mut self, store_meta: &Mutex>) { + self.leader_lease_mut().expire(); + let mut meta = store_meta.lock().unwrap(); + if let Some((reader, _)) = meta.readers.get_mut(&self.region_id()) { + self.maybe_update_read_progress(reader, ReadProgress::unset_leader_lease()); + } + } + pub(crate) fn maybe_update_read_progress( &self, reader: &mut ReadDelegate, diff --git a/components/raftstore-v2/src/operation/query/local.rs b/components/raftstore-v2/src/operation/query/local.rs index f574571f790..9101b1850e8 100644 --- a/components/raftstore-v2/src/operation/query/local.rs +++ b/components/raftstore-v2/src/operation/query/local.rs @@ -36,7 +36,7 @@ use crate::{ StoreRouter, }; -pub trait MsgRouter: Clone + Send { +pub trait MsgRouter: Clone + Send + 'static { fn send(&self, addr: u64, msg: PeerMsg) -> std::result::Result<(), TrySendError>; } @@ -184,10 +184,15 @@ where Ok(ReadRequestPolicy::StaleRead) => { ReadResult::Ok((delegate, ReadRequestPolicy::StaleRead)) } - // It can not handle other policies. // TODO: we should only abort when lease expires. For other cases we should retry // infinitely. - Ok(ReadRequestPolicy::ReadIndex) => ReadResult::Redirect, + Ok(ReadRequestPolicy::ReadIndex) => { + if req.get_header().get_replica_read() { + ReadResult::Ok((delegate, ReadRequestPolicy::ReadIndex)) + } else { + ReadResult::Redirect + } + } Err(e) => ReadResult::Err(e), } } @@ -195,6 +200,7 @@ where fn try_get_snapshot( &mut self, req: &RaftCmdRequest, + after_read_index: bool, ) -> ReadResult, RaftCmdResponse> { match self.pre_propose_raft_command(req) { ReadResult::Ok((mut delegate, policy)) => { @@ -243,7 +249,26 @@ where .with(|m| m.borrow_mut().local_executed_stale_read_requests.inc()); snap } - _ => unreachable!(), + ReadRequestPolicy::ReadIndex => { + // ReadIndex is returned only for replica read. + if !after_read_index { + // It needs to read index before getting snapshot. 
+ return ReadResult::Redirect; + } + + let region = Arc::clone(&delegate.region); + let snap = RegionSnapshot::from_snapshot( + Arc::new(delegate.cached_tablet.cache().snapshot()), + region, + ); + + TLS_LOCAL_READ_METRICS.with(|m| { + m.borrow_mut().local_executed_requests.inc(); + m.borrow_mut().local_executed_replica_read_requests.inc() + }); + + snap + } }; snap.txn_ext = Some(delegate.txn_ext.clone()); @@ -274,12 +299,13 @@ where pub fn snapshot( &mut self, mut req: RaftCmdRequest, - ) -> impl Future, RaftCmdResponse>> + Send - { + ) -> impl Future, RaftCmdResponse>> + + Send + + 'static { let region_id = req.header.get_ref().region_id; let mut tried_cnt = 0; let res = loop { - let res = self.try_get_snapshot(&req); + let res = self.try_get_snapshot(&req, false /* after_read_index */); match res { ReadResult::Ok(snap) => break Either::Left(Ok(snap)), ReadResult::Err(e) => break Either::Left(Err(e)), @@ -314,7 +340,17 @@ where Some(query_res) => { if query_res.read().is_none() { let QueryResult::Response(res) = query_res else { unreachable!() }; - assert!(res.get_header().has_error(), "{:?}", res); + // Get an error explicitly in header, + // or leader reports KeyIsLocked error via read index. + assert!( + res.get_header().has_error() + || res + .get_responses() + .get(0) + .map_or(false, |r| r.get_read_index().has_locked()), + "{:?}", + res + ); return Err(res); } } @@ -329,7 +365,7 @@ where // If query successful, try again. req.mut_header().set_read_quorum(false); loop { - let r = reader.try_get_snapshot(&req); + let r = reader.try_get_snapshot(&req, true /* after_read_index */); match r { ReadResult::Ok(snap) => return Ok(snap), ReadResult::Err(e) => return Err(e), @@ -366,7 +402,8 @@ where &self, region_id: u64, req: &RaftCmdRequest, - ) -> impl Future, RaftCmdResponse>> { + ) -> impl Future, RaftCmdResponse>> + 'static + { let mut req = req.clone(); // Remote lease is updated step by step. It's possible local reader expires // while the raftstore doesn't. 
So we need to trigger an update @@ -879,7 +916,7 @@ mod tests { .get_mut(&1) .unwrap() .0 - .update(ReadProgress::leader_lease(remote)); + .update(ReadProgress::set_leader_lease(remote)); }), rx, ch_tx.clone(), diff --git a/components/raftstore-v2/src/operation/query/mod.rs b/components/raftstore-v2/src/operation/query/mod.rs index 81fb4e5e9de..6e130a085dd 100644 --- a/components/raftstore-v2/src/operation/query/mod.rs +++ b/components/raftstore-v2/src/operation/query/mod.rs @@ -164,7 +164,12 @@ impl Peer { return Err(e); } - // TODO: check applying snapshot + // Check whether the peer is initialized. + if !self.storage().is_initialized() { + raft_metrics.invalid_proposal.region_not_initialized.inc(); + let region_id = msg.get_header().get_region_id(); + return Err(Error::RegionNotInitialized(region_id)); + } // Check whether the term is stale. if let Err(e) = util::check_term(msg.get_header(), self.term()) { @@ -186,7 +191,21 @@ impl Peer { req: RaftCmdRequest, ch: QueryResChannel, ) { - // TODO: add pre_read_index to handle splitting or merging + if let Err(e) = self.pre_read_index() { + debug!( + self.logger, + "prevents unsafe read index"; + "err" => ?e, + ); + ctx.raft_metrics.propose.unsafe_read_index.inc(); + let mut resp = RaftCmdResponse::default(); + let term = self.term(); + cmd_resp::bind_term(&mut resp, term); + cmd_resp::bind_error(&mut resp, e); + ch.report_error(resp); + return; + } + if self.is_leader() { self.read_index_leader(ctx, req, ch); } else { @@ -282,7 +301,7 @@ impl Peer { self.storage().apply_state().get_applied_index() >= read_index // If it is in pending merge state(i.e. applied PrepareMerge), the data may be stale. 
// TODO: Add a test to cover this case - && self.proposal_control().has_applied_prepare_merge() + && !self.proposal_control().has_applied_prepare_merge() } #[inline] diff --git a/components/raftstore-v2/src/operation/query/replica.rs b/components/raftstore-v2/src/operation/query/replica.rs index 901fd9726f6..b4edbd2097a 100644 --- a/components/raftstore-v2/src/operation/query/replica.rs +++ b/components/raftstore-v2/src/operation/query/replica.rs @@ -9,7 +9,7 @@ use raftstore::{ fsm::apply::notify_stale_req, metrics::RAFT_READ_INDEX_PENDING_COUNT, msg::{ErrorCallback, ReadCallback}, - propose_read_index, ReadIndexRequest, Transport, + propose_read_index, Config, ReadIndexContext, ReadIndexRequest, Transport, }, Error, }; @@ -23,6 +23,29 @@ use crate::{ router::{QueryResChannel, QueryResult, ReadResponse}, }; impl Peer { + /// `ReadIndex` requests could be lost in network, so on followers commands + /// could queue in `pending_reads` forever. Sending a new `ReadIndex` + /// periodically can resolve this. + pub fn retry_pending_reads(&mut self, cfg: &Config) { + if self.is_leader() + || !self.pending_reads_mut().check_needs_retry(cfg) + || self.pre_read_index().is_err() + { + return; + } + + let read = self.pending_reads().back().unwrap(); + debug!( + self.logger, + "request to get a read index from follower, retry"; + "request_id" => ?read.id, + ); + let ctx = + ReadIndexContext::fields_to_bytes(read.id, read.addition_request.as_deref(), None); + debug_assert!(read.read_index.is_none()); + self.raft_group_mut().read_index(ctx); + } + /// read index on follower /// /// call set_has_ready if it's proposed. @@ -49,7 +72,8 @@ impl Peer { .get_mut(0) .filter(|req| req.has_read_index()) .map(|req| req.take_read_index()); - let (id, _dropped) = propose_read_index(self.raft_group_mut(), request.as_ref(), None); + // No need to check `dropped` as it only meaningful for leader. 
+ let (id, _dropped) = propose_read_index(self.raft_group_mut(), request.as_ref()); let now = monotonic_raw_now(); let mut read = ReadIndexRequest::with_command(id, req, ch, now); read.addition_request = request.map(Box::new); diff --git a/components/raftstore-v2/src/operation/ready/apply_trace.rs b/components/raftstore-v2/src/operation/ready/apply_trace.rs index 6c9c73479ba..9d7cae00e9d 100644 --- a/components/raftstore-v2/src/operation/ready/apply_trace.rs +++ b/components/raftstore-v2/src/operation/ready/apply_trace.rs @@ -298,10 +298,7 @@ impl ApplyTrace { /// `None` is returned. #[inline] pub fn log_recovery(&self) -> Option> { - let mut flushed_indexes = [0; DATA_CFS_LEN]; - for (off, pr) in self.data_cfs.iter().enumerate() { - flushed_indexes[off] = pr.flushed; - } + let flushed_indexes = self.flushed_indexes(); for i in flushed_indexes { if i > self.admin.flushed { return Some(Box::new(flushed_indexes)); @@ -310,6 +307,16 @@ impl ApplyTrace { None } + /// Get the flushed indexes of all data CF that is needed when recoverying + /// logs. It does not check the admin cf. + pub fn flushed_indexes(&self) -> DataTrace { + let mut flushed_indexes = [0; DATA_CFS_LEN]; + for (off, pr) in self.data_cfs.iter().enumerate() { + flushed_indexes[off] = pr.flushed; + } + flushed_indexes + } + pub fn restore_snapshot(&mut self, index: u64) { for pr in self.data_cfs.iter_mut() { pr.last_modified = index; diff --git a/components/raftstore-v2/src/operation/ready/mod.rs b/components/raftstore-v2/src/operation/ready/mod.rs index 58c7e904037..b1c413466cf 100644 --- a/components/raftstore-v2/src/operation/ready/mod.rs +++ b/components/raftstore-v2/src/operation/ready/mod.rs @@ -96,7 +96,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> PeerFsmDelegate<'a, EK, ER, /// Raft relies on periodic ticks to keep the state machine sync with other /// peers. 
pub fn on_raft_tick(&mut self) { - if self.fsm.peer_mut().tick() { + if self.fsm.peer_mut().tick(self.store_ctx) { self.fsm.peer_mut().set_has_ready(); } self.fsm.peer_mut().maybe_clean_up_stale_merge_context(); @@ -152,11 +152,15 @@ impl Peer { } #[inline] - fn tick(&mut self) -> bool { + fn tick(&mut self, store_ctx: &mut StoreContext) -> bool { // When it's handling snapshot, it's pointless to tick as all the side // affects have to wait till snapshot is applied. On the other hand, ticking // will bring other corner cases like elections. - !self.is_handling_snapshot() && self.serving() && self.raft_group_mut().tick() + if self.is_handling_snapshot() || !self.serving() { + return false; + } + self.retry_pending_reads(&store_ctx.cfg); + self.raft_group_mut().tick() } pub fn on_peer_unreachable(&mut self, to_peer_id: u64) { @@ -294,7 +298,7 @@ impl Peer { if self.is_leader() { self.add_peer_heartbeat(from_peer.get_id(), Instant::now()); } - // We only cache peer with an vaild ID. + // We only cache peer with an valid ID. // It prevents cache peer(0,0) which is sent by region split. self.insert_peer_cache(from_peer); } @@ -312,6 +316,14 @@ impl Peer { ctx.raft_metrics.message_dropped.stale_msg.inc(); return; } + if msg.get_message().get_msg_type() == MessageType::MsgReadIndex + && self.is_leader() + && self.on_step_read_index(ctx, msg.mut_message()) + { + // Read index has respond in `on_step_read_index`. + return; + } + // As this peer is already created, the empty split message is meaningless. 
if is_empty_split_message(&msg) { ctx.raft_metrics.message_dropped.stale_msg.inc(); @@ -379,7 +391,7 @@ impl Peer { let to_peer = match self.peer_from_cache(msg.to) { Some(p) => p, None => { - warn!(self.logger, "failed to look up recipient peer"; "to_peer" => msg.to); + warn!(self.logger, "failed to look up recipient peer"; "to_peer" => msg.to, "message_type" => ?msg.msg_type); return None; } }; @@ -607,6 +619,12 @@ impl Peer { |entry| entry.index == self.raft_group().raft.raft_log.last_index() )); + fail::fail_point!( + "before_handle_snapshot_ready_3", + self.peer_id() == 3 && self.get_pending_snapshot().is_some(), + |_| () + ); + self.on_role_changed(ctx, &ready); if let Some(hs) = ready.hs() { @@ -899,7 +917,7 @@ impl Peer { self.maybe_schedule_gc_peer_tick(); } StateRole::Follower => { - self.leader_lease_mut().expire(); + self.expire_lease_on_became_follower(&ctx.store_meta); self.storage_mut().cancel_generating_snap(None); self.txn_context() .on_became_follower(self.term(), self.region()); diff --git a/components/raftstore-v2/src/operation/ready/snapshot.rs b/components/raftstore-v2/src/operation/ready/snapshot.rs index 5547df7d580..97bcf890260 100644 --- a/components/raftstore-v2/src/operation/ready/snapshot.rs +++ b/components/raftstore-v2/src/operation/ready/snapshot.rs @@ -662,6 +662,7 @@ impl Storage { let reg = reg.clone(); let key_manager = key_manager.cloned(); let hook = move || { + fail::fail_point!("region_apply_snap"); if !install_tablet(®, key_manager.as_deref(), &path, region_id, last_index) { slog_panic!( logger, diff --git a/components/raftstore-v2/src/raft/apply.rs b/components/raftstore-v2/src/raft/apply.rs index d32b8bdbb80..82ca99260e8 100644 --- a/components/raftstore-v2/src/raft/apply.rs +++ b/components/raftstore-v2/src/raft/apply.rs @@ -3,7 +3,7 @@ use std::{mem, sync::Arc}; use engine_traits::{ - FlushState, KvEngine, PerfContextKind, TabletRegistry, WriteBatch, DATA_CFS_LEN, + FlushState, KvEngine, PerfContextKind, SstApplyState, 
TabletRegistry, WriteBatch, DATA_CFS_LEN, }; use kvproto::{metapb, raft_cmdpb::RaftCmdResponse, raft_serverpb::RegionLocalState}; use pd_client::BucketStat; @@ -21,6 +21,7 @@ use tikv_util::{log::SlogFormat, worker::Scheduler}; use crate::{ operation::{AdminCmdResult, ApplyFlowControl, DataTrace}, router::CmdResChannel, + worker::checkpoint, }; pub(crate) struct Observe { @@ -56,6 +57,7 @@ pub struct Apply { modifications: DataTrace, admin_cmd_result: Vec, flush_state: Arc, + sst_apply_state: SstApplyState, /// The flushed indexes of each column family before being restarted. /// /// If an apply index is less than the flushed index, the log can be @@ -71,6 +73,10 @@ pub struct Apply { observe: Observe, coprocessor_host: CoprocessorHost, + checkpoint_scheduler: Scheduler>, + // Whether to use the delete range API instead of deleting one by one. + use_delete_range: bool, + pub(crate) metrics: ApplyMetrics, pub(crate) logger: Logger, pub(crate) buckets: Option, @@ -86,11 +92,13 @@ impl Apply { tablet_registry: TabletRegistry, read_scheduler: Scheduler>, flush_state: Arc, + sst_apply_state: SstApplyState, log_recovery: Option>, applied_term: u64, buckets: Option, sst_importer: Arc, coprocessor_host: CoprocessorHost, + checkpoint_scheduler: Scheduler>, logger: Logger, ) -> Self { let mut remote_tablet = tablet_registry @@ -119,10 +127,13 @@ impl Apply { key_buffer: vec![], res_reporter, flush_state, + sst_apply_state, log_recovery, metrics: ApplyMetrics::default(), buckets, sst_importer, + checkpoint_scheduler, + use_delete_range: cfg.use_delete_range, observe: Observe { info: CmdObserveInfo::default(), level: ObserveLevel::None, @@ -199,6 +210,10 @@ impl Apply { self.region().get_id() } + pub fn peer_id(&self) -> u64 { + self.peer.get_id() + } + /// The tablet can't be public yet, otherwise content of latest tablet /// doesn't matches its epoch in both readers and peer fsm. 
#[inline] @@ -270,6 +285,11 @@ impl Apply { &self.flush_state } + #[inline] + pub fn set_sst_applied_index(&mut self, uuid: Vec>, apply_index: u64) { + self.sst_apply_state.registe_ssts(uuid, apply_index); + } + #[inline] pub fn log_recovery(&self) -> &Option> { &self.log_recovery @@ -308,4 +328,13 @@ impl Apply { pub fn coprocessor_host(&self) -> &CoprocessorHost { &self.coprocessor_host } + + #[inline] + pub fn checkpoint_scheduler(&self) -> &Scheduler> { + &self.checkpoint_scheduler + } + + pub fn use_delete_range(&self) -> bool { + self.use_delete_range + } } diff --git a/components/raftstore-v2/src/raft/peer.rs b/components/raftstore-v2/src/raft/peer.rs index e11c96922cd..1500737da3b 100644 --- a/components/raftstore-v2/src/raft/peer.rs +++ b/components/raftstore-v2/src/raft/peer.rs @@ -9,14 +9,14 @@ use std::{ use collections::{HashMap, HashSet}; use encryption_export::DataKeyManager; use engine_traits::{ - CachedTablet, FlushState, KvEngine, RaftEngine, TabletContext, TabletRegistry, + CachedTablet, FlushState, KvEngine, RaftEngine, SstApplyState, TabletContext, TabletRegistry, }; use kvproto::{ metapb::{self, PeerRole}, pdpb, - raft_serverpb::{RaftMessage, RegionLocalState}, + raft_serverpb::RaftMessage, }; -use raft::{RawNode, StateRole}; +use raft::{eraftpb, RawNode, StateRole}; use raftstore::{ coprocessor::{CoprocessorHost, RegionChangeEvent, RegionChangeReason}, store::{ @@ -107,6 +107,7 @@ pub struct Peer { /// advancing apply index. state_changes: Option>, flush_state: Arc, + sst_apply_state: SstApplyState, /// lead_transferee if this peer(leader) is in a leadership transferring. leader_transferee: u64, @@ -147,6 +148,7 @@ impl Peer { let region = raft_group.store().region_state().get_region().clone(); let flush_state: Arc = Arc::new(FlushState::new(applied_index)); + let sst_apply_state = SstApplyState::default(); // We can't create tablet if tablet index is 0. It can introduce race when gc // old tablet and create new peer. 
We also can't get the correct range of the // region, which is required for kv data gc. @@ -199,6 +201,7 @@ impl Peer { split_trace: vec![], state_changes: None, flush_state, + sst_apply_state, split_flow_control: SplitFlowControl::default(), leader_transferee: raft::INVALID_ID, long_uncommitted_threshold: cmp::max( @@ -261,11 +264,12 @@ impl Peer { self.leader_lease.expire_remote_lease(); } - let mut region_state = RegionLocalState::default(); - region_state.set_region(region.clone()); - region_state.set_tablet_index(tablet_index); - region_state.set_state(self.storage().region_state().get_state()); - self.storage_mut().set_region_state(region_state); + self.storage_mut() + .region_state_mut() + .set_region(region.clone()); + self.storage_mut() + .region_state_mut() + .set_tablet_index(tablet_index); let progress = ReadProgress::region(region); // Always update read delegate's region to avoid stale region info after a @@ -281,7 +285,7 @@ impl Peer { if let Some(progress) = self .leader_lease .maybe_new_remote_lease(self.term()) - .map(ReadProgress::leader_lease) + .map(ReadProgress::set_leader_lease) { self.maybe_update_read_progress(reader, progress); } @@ -428,6 +432,11 @@ impl Peer { self.raft_group.raft.raft_log.persisted } + #[inline] + pub fn get_pending_snapshot(&self) -> Option<&eraftpb::Snapshot> { + self.raft_group.snap() + } + #[inline] pub fn self_stat(&self) -> &PeerStat { &self.self_stat @@ -792,6 +801,11 @@ impl Peer { &self.flush_state } + #[inline] + pub fn sst_apply_state(&self) -> &SstApplyState { + &self.sst_apply_state + } + pub fn reset_flush_state(&mut self, index: u64) { self.flush_state = Arc::new(FlushState::new(index)); } diff --git a/components/raftstore-v2/src/raft/storage.rs b/components/raftstore-v2/src/raft/storage.rs index 7edf8c02f09..298ba5d451f 100644 --- a/components/raftstore-v2/src/raft/storage.rs +++ b/components/raftstore-v2/src/raft/storage.rs @@ -321,7 +321,8 @@ mod tests { kv::{KvTestEngine, TestTabletFactory}, }; use 
engine_traits::{ - FlushState, RaftEngine, RaftLogBatch, TabletContext, TabletRegistry, DATA_CFS, + FlushState, RaftEngine, RaftLogBatch, SstApplyState, TabletContext, TabletRegistry, + DATA_CFS, }; use kvproto::{ metapb::{Peer, Region}, @@ -338,7 +339,7 @@ mod tests { }; use slog::o; use tempfile::TempDir; - use tikv_util::worker::Worker; + use tikv_util::worker::{dummy_scheduler, Worker}; use super::*; use crate::{ @@ -506,6 +507,8 @@ mod tests { state.set_region(region.clone()); let (_tmp_dir, importer) = create_tmp_importer(); let host = CoprocessorHost::::default(); + + let (dummy_scheduler, _) = dummy_scheduler(); // setup peer applyer let mut apply = Apply::new( &Config::default(), @@ -515,11 +518,13 @@ mod tests { reg, sched, Arc::new(FlushState::new(5)), + SstApplyState::default(), None, 5, None, importer, host, + dummy_scheduler, logger, ); diff --git a/components/raftstore-v2/src/router/imp.rs b/components/raftstore-v2/src/router/imp.rs index b28dc95aa35..325e4ee4a1b 100644 --- a/components/raftstore-v2/src/router/imp.rs +++ b/components/raftstore-v2/src/router/imp.rs @@ -159,8 +159,9 @@ impl RaftRouter { pub fn snapshot( &mut self, req: RaftCmdRequest, - ) -> impl Future, RaftCmdResponse>> + Send - { + ) -> impl Future, RaftCmdResponse>> + + Send + + 'static { self.local_reader.snapshot(req) } diff --git a/components/raftstore-v2/src/router/message.rs b/components/raftstore-v2/src/router/message.rs index 3f761c74f94..9ce4e8a8807 100644 --- a/components/raftstore-v2/src/router/message.rs +++ b/components/raftstore-v2/src/router/message.rs @@ -89,6 +89,7 @@ pub enum StoreTick { SnapGc, ConsistencyCheck, CleanupImportSst, + CompactCheck, } impl StoreTick { @@ -99,6 +100,7 @@ impl StoreTick { StoreTick::SnapGc => RaftEventDurationType::snap_gc, StoreTick::ConsistencyCheck => RaftEventDurationType::consistency_check, StoreTick::CleanupImportSst => RaftEventDurationType::cleanup_import_sst, + StoreTick::CompactCheck => RaftEventDurationType::compact_check, } 
} } diff --git a/components/raftstore-v2/src/worker/checkpoint.rs b/components/raftstore-v2/src/worker/checkpoint.rs new file mode 100644 index 00000000000..e10f62584d5 --- /dev/null +++ b/components/raftstore-v2/src/worker/checkpoint.rs @@ -0,0 +1,132 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{fmt::Display, path::PathBuf, time::Duration}; + +use engine_traits::{Checkpointer, KvEngine, TabletRegistry}; +use futures::channel::oneshot::Sender; +use raftstore::store::RAFT_INIT_LOG_INDEX; +use slog::Logger; +use tikv_util::{slog_panic, time::Instant, worker::Runnable}; + +use crate::operation::SPLIT_PREFIX; + +pub enum Task { + Checkpoint { + // it is only used to assert + log_index: u64, + parent_region: u64, + split_regions: Vec, + tablet: EK, + sender: Sender, + }, +} + +impl Display for Task { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Task::Checkpoint { + log_index, + parent_region, + split_regions, + .. 
+ } => write!( + f, + "create checkpoint for batch split, parent region_id {}, source region_ids {:?}, log_index {}", + parent_region, split_regions, log_index, + ), + } + } +} + +pub struct Runner { + logger: Logger, + tablet_registry: TabletRegistry, +} + +pub fn temp_split_path(registry: &TabletRegistry, region_id: u64) -> PathBuf { + let tablet_name = registry.tablet_name(SPLIT_PREFIX, region_id, RAFT_INIT_LOG_INDEX); + registry.tablet_root().join(tablet_name) +} + +impl Runner { + pub fn new(logger: Logger, tablet_registry: TabletRegistry) -> Self { + Self { + logger, + tablet_registry, + } + } + + fn checkpoint( + &self, + parent_region: u64, + split_regions: Vec, + log_index: u64, + tablet: EK, + sender: Sender, + ) { + let now = Instant::now(); + let mut checkpointer = tablet.new_checkpointer().unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint object"; + "region_id" => parent_region, + "error" => ?e + ) + }); + + for id in split_regions { + let split_temp_path = temp_split_path(&self.tablet_registry, id); + checkpointer + .create_at(&split_temp_path, None, 0) + .unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint"; + "region_id" => parent_region, + "path" => %split_temp_path.display(), + "error" => ?e + ) + }); + } + + let derived_path = self.tablet_registry.tablet_path(parent_region, log_index); + + // If it's recovered from restart, it's possible the target path exists already. + // And because checkpoint is atomic, we don't need to worry about corruption. + // And it's also wrong to delete it and remake as it may have applied and flushed + // some data to the new checkpoint before being restarted.
+ if !derived_path.exists() { + checkpointer + .create_at(&derived_path, None, 0) + .unwrap_or_else(|e| { + slog_panic!( + self.logger, + "fails to create checkpoint"; + "region_id" => parent_region, + "path" => %derived_path.display(), + "error" => ?e + ) + }); + } + + sender.send(now.saturating_elapsed()).unwrap(); + } +} + +impl Runnable for Runner { + type Task = Task; + + fn run(&mut self, task: Self::Task) { + match task { + Task::Checkpoint { + log_index, + parent_region, + split_regions, + tablet, + sender, + } => { + self.checkpoint(parent_region, split_regions, log_index, tablet, sender); + } + } + } +} diff --git a/components/raftstore-v2/src/worker/cleanup/compact.rs b/components/raftstore-v2/src/worker/cleanup/compact.rs new file mode 100644 index 00000000000..c7d7aef897d --- /dev/null +++ b/components/raftstore-v2/src/worker/cleanup/compact.rs @@ -0,0 +1,365 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + error::Error as StdError, + fmt::{self, Display, Formatter}, +}; + +use engine_traits::{KvEngine, RangeStats, TabletRegistry, CF_WRITE}; +use fail::fail_point; +use keys::{DATA_MAX_KEY, DATA_MIN_KEY}; +use slog::{debug, error, info, warn, Logger}; +use thiserror::Error; +use tikv_util::{box_try, worker::Runnable}; + +pub enum Task { + CheckAndCompact { + // Column families need to compact + cf_names: Vec, + region_ids: Vec, + compact_threshold: CompactThreshold, + }, +} + +pub struct CompactThreshold { + tombstones_num_threshold: u64, + tombstones_percent_threshold: u64, + redundant_rows_threshold: u64, + redundant_rows_percent_threshold: u64, +} + +impl CompactThreshold { + pub fn new( + tombstones_num_threshold: u64, + tombstones_percent_threshold: u64, + redundant_rows_threshold: u64, + redundant_rows_percent_threshold: u64, + ) -> Self { + Self { + tombstones_num_threshold, + tombstones_percent_threshold, + redundant_rows_percent_threshold, + redundant_rows_threshold, + } + } +} + +impl Display for Task { + 
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match *self { + Task::CheckAndCompact { + ref cf_names, + ref region_ids, + ref compact_threshold, + } => f + .debug_struct("CheckAndCompact") + .field("cf_names", cf_names) + .field("regions", region_ids) + .field( + "tombstones_num_threshold", + &compact_threshold.tombstones_num_threshold, + ) + .field( + "tombstones_percent_threshold", + &compact_threshold.tombstones_percent_threshold, + ) + .field( + "redundant_rows_threshold", + &compact_threshold.redundant_rows_threshold, + ) + .field( + "redundant_rows_percent_threshold", + &compact_threshold.redundant_rows_percent_threshold, + ) + .finish(), + } + } +} + +#[derive(Debug, Error)] +pub enum Error { + #[error("compact failed {0:?}")] + Other(#[from] Box), +} + +pub struct Runner { + logger: Logger, + tablet_registry: TabletRegistry, +} + +impl Runner +where + E: KvEngine, +{ + pub fn new(tablet_registry: TabletRegistry, logger: Logger) -> Runner { + Runner { + logger, + tablet_registry, + } + } +} + +impl Runnable for Runner +where + E: KvEngine, +{ + type Task = Task; + + fn run(&mut self, task: Self::Task) { + match task { + Task::CheckAndCompact { + cf_names, + region_ids, + compact_threshold, + } => match collect_regions_to_compact( + &self.tablet_registry, + region_ids, + compact_threshold, + &self.logger, + ) { + Ok(mut region_ids) => { + for region_id in region_ids.drain(..) 
{ + let Some(mut tablet_cache) = self.tablet_registry.get(region_id) else {continue}; + let Some(tablet) = tablet_cache.latest() else {continue}; + for cf in &cf_names { + if let Err(e) = + tablet.compact_range_cf(cf, None, None, false, 1 /* threads */) + { + error!( + self.logger, + "compact range failed"; + "region_id" => region_id, + "cf" => cf, + "err" => %e, + ); + } + } + info!( + self.logger, + "compaction range finished"; + "region_id" => region_id, + ); + fail_point!("raftstore-v2::CheckAndCompact::AfterCompact"); + } + } + Err(e) => warn!( + self.logger, + "check ranges need reclaim failed"; "err" => %e + ), + }, + } + } +} + +fn need_compact(range_stats: &RangeStats, compact_threshold: &CompactThreshold) -> bool { + if range_stats.num_entries < range_stats.num_versions { + return false; + } + + // We trigger region compaction when there are too many tombstones as well as + // redundant keys, both of which can severely impact scan operation: + let estimate_num_del = range_stats.num_entries - range_stats.num_versions; + let redundant_keys = range_stats.num_entries - range_stats.num_rows; + (redundant_keys >= compact_threshold.redundant_rows_threshold + && redundant_keys * 100 + >= compact_threshold.redundant_rows_percent_threshold * range_stats.num_entries) + || (estimate_num_del >= compact_threshold.tombstones_num_threshold + && estimate_num_del * 100 + >= compact_threshold.tombstones_percent_threshold * range_stats.num_entries) +} + +fn collect_regions_to_compact( + reg: &TabletRegistry, + region_ids: Vec, + compact_threshold: CompactThreshold, + logger: &Logger, +) -> Result, Error> { + fail_point!("on_collect_regions_to_compact"); + debug!( + logger, + "received compaction check"; + "regions" => ?region_ids + ); + let mut regions_to_compact = vec![]; + for id in region_ids { + let Some(mut tablet_cache) = reg.get(id) else {continue}; + let Some(tablet) = tablet_cache.latest() else {continue}; + if tablet.auto_compactions_is_disabled().expect("cf") {
info!( + logger, + "skip compact check when disabled auto compactions"; + "region_id" => id, + ); + continue; + } + + if let Some(range_stats) = + box_try!(tablet.get_range_stats(CF_WRITE, DATA_MIN_KEY, DATA_MAX_KEY)) + { + info!( + logger, + "get range entries and versions"; + "num_entries" => range_stats.num_entries, + "num_versions" => range_stats.num_versions, + "num_rows" => range_stats.num_rows, + "region_id" => id, + ); + if need_compact(&range_stats, &compact_threshold) { + regions_to_compact.push(id); + } + } + } + Ok(regions_to_compact) +} + +#[cfg(test)] +mod tests { + use engine_test::{ + ctor::{CfOptions, DbOptions}, + kv::{KvTestEngine, TestTabletFactory}, + }; + use engine_traits::{MiscExt, SyncMutable, TabletContext, TabletRegistry, CF_DEFAULT, CF_LOCK}; + use keys::data_key; + use kvproto::metapb::Region; + use tempfile::Builder; + use txn_types::{Key, TimeStamp, Write, WriteType}; + + use super::*; + + fn build_test_factory(name: &'static str) -> (tempfile::TempDir, TabletRegistry) { + let dir = Builder::new().prefix(name).tempdir().unwrap(); + let mut cf_opts = CfOptions::new(); + cf_opts.set_level_zero_file_num_compaction_trigger(8); + let factory = Box::new(TestTabletFactory::new( + DbOptions::default(), + vec![ + (CF_DEFAULT, CfOptions::new()), + (CF_LOCK, CfOptions::new()), + (CF_WRITE, cf_opts), + ], + )); + let registry = TabletRegistry::new(factory, dir.path()).unwrap(); + (dir, registry) + } + + fn mvcc_put(db: &KvTestEngine, k: &[u8], v: &[u8], start_ts: TimeStamp, commit_ts: TimeStamp) { + let k = Key::from_encoded(data_key(k)).append_ts(commit_ts); + let w = Write::new(WriteType::Put, start_ts, Some(v.to_vec())); + db.put_cf(CF_WRITE, k.as_encoded(), &w.as_ref().to_bytes()) + .unwrap(); + } + + fn delete(db: &KvTestEngine, k: &[u8], commit_ts: TimeStamp) { + let k = Key::from_encoded(data_key(k)).append_ts(commit_ts); + db.delete_cf(CF_WRITE, k.as_encoded()).unwrap(); + } + + #[test] + fn test_compact_range() { + let (_dir, registry) = 
build_test_factory("compact-range-test"); + + let mut region = Region::default(); + region.set_id(2); + let ctx = TabletContext::new(&region, Some(5)); + let mut cache = registry.load(ctx, true).unwrap(); + let tablet = cache.latest().unwrap(); + + // mvcc_put 0..5 + for i in 0..5 { + let (k, v) = (format!("k{}", i), format!("value{}", i)); + mvcc_put(tablet, k.as_bytes(), v.as_bytes(), 1.into(), 2.into()); + mvcc_put(tablet, k.as_bytes(), v.as_bytes(), 3.into(), 4.into()); + } + tablet.flush_cf(CF_WRITE, true).unwrap(); + + // gc 0..5 + for i in 0..5 { + let k = format!("k{}", i); + delete(tablet, k.as_bytes(), 4.into()); + } + tablet.flush_cf(CF_WRITE, true).unwrap(); + + let (start, end) = (data_key(b"k0"), data_key(b"k5")); + let range_stats = tablet + .get_range_stats(CF_WRITE, &start, &end) + .unwrap() + .unwrap(); + assert_eq!(range_stats.num_entries, 15); + assert_eq!(range_stats.num_versions, 10); + assert_eq!(range_stats.num_rows, 5); + + region.set_id(3); + let ctx = TabletContext::new(&region, Some(5)); + let mut cache = registry.load(ctx, true).unwrap(); + let tablet = cache.latest().unwrap(); + // mvcc_put 5..10 + for i in 5..10 { + let (k, v) = (format!("k{}", i), format!("value{}", i)); + mvcc_put(tablet, k.as_bytes(), v.as_bytes(), 1.into(), 2.into()); + } + for i in 5..8 { + let (k, v) = (format!("k{}", i), format!("value{}", i)); + mvcc_put(tablet, k.as_bytes(), v.as_bytes(), 3.into(), 4.into()); + } + tablet.flush_cf(CF_WRITE, true).unwrap(); + + let (s, e) = (data_key(b"k5"), data_key(b"k9")); + let range_stats = tablet.get_range_stats(CF_WRITE, &s, &e).unwrap().unwrap(); + assert_eq!(range_stats.num_entries, 8); + assert_eq!(range_stats.num_versions, 8); + assert_eq!(range_stats.num_rows, 5); + + // gc 5..8 + for i in 5..8 { + let k = format!("k{}", i); + delete(tablet, k.as_bytes(), 4.into()); + } + tablet.flush_cf(CF_WRITE, true).unwrap(); + + let (s, e) = (data_key(b"k5"), data_key(b"k9")); + let range_stats = tablet.get_range_stats(CF_WRITE, &s,
&e).unwrap().unwrap(); + assert_eq!(range_stats.num_entries, 11); + assert_eq!(range_stats.num_versions, 8); + assert_eq!(range_stats.num_rows, 5); + + let logger = slog_global::borrow_global().new(slog::o!()); + + // collect regions according to tombstone's parameters + let regions = collect_regions_to_compact( + &registry, + vec![2, 3, 4], + CompactThreshold::new(4, 30, 100, 100), + &logger, + ) + .unwrap(); + assert!(regions.len() == 1 && regions[0] == 2); + + let regions = collect_regions_to_compact( + &registry, + vec![2, 3, 4], + CompactThreshold::new(3, 25, 100, 100), + &logger, + ) + .unwrap(); + assert!(regions.len() == 2 && !regions.contains(&4)); + + // collect regions according to redundant rows' parameter + let regions = collect_regions_to_compact( + &registry, + vec![2, 3, 4], + CompactThreshold::new(100, 100, 9, 60), + &logger, + ) + .unwrap(); + assert!(regions.len() == 1 && regions[0] == 2); + + let regions = collect_regions_to_compact( + &registry, + vec![2, 3, 4], + CompactThreshold::new(100, 100, 5, 50), + &logger, + ) + .unwrap(); + assert!(regions.len() == 2 && !regions.contains(&4)); + } +} diff --git a/components/raftstore-v2/src/worker/cleanup/mod.rs b/components/raftstore-v2/src/worker/cleanup/mod.rs new file mode 100644 index 00000000000..0d04fd1eb70 --- /dev/null +++ b/components/raftstore-v2/src/worker/cleanup/mod.rs @@ -0,0 +1,42 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0.
+ +use std::fmt::{self, Display, Formatter}; + +pub use compact::{CompactThreshold, Runner as CompactRunner, Task as CompactTask}; +use engine_traits::KvEngine; +use tikv_util::worker::Runnable; + +mod compact; + +pub enum Task { + Compact(CompactTask), +} + +impl Display for Task { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + match self { + Task::Compact(ref t) => t.fmt(f), + } + } +} + +pub struct Runner { + compact: CompactRunner, + // todo: more cleanup related runner may be added later +} + +impl Runner { + pub fn new(compact: CompactRunner) -> Runner { + Runner { compact } + } +} + +impl Runnable for Runner { + type Task = Task; + + fn run(&mut self, task: Task) { + match task { + Task::Compact(t) => self.compact.run(t), + } + } +} diff --git a/components/raftstore-v2/src/worker/mod.rs b/components/raftstore-v2/src/worker/mod.rs index 2fa7255afd3..93ec453c030 100644 --- a/components/raftstore-v2/src/worker/mod.rs +++ b/components/raftstore-v2/src/worker/mod.rs @@ -1,4 +1,6 @@ // Copyright 2022 TiKV Project Authors. Licensed under Apache-2.0. +pub mod checkpoint; +pub mod cleanup; pub mod pd; pub mod tablet; diff --git a/components/raftstore/src/coprocessor/mod.rs b/components/raftstore/src/coprocessor/mod.rs index f5bdd8664e6..7c84b09ce7e 100644 --- a/components/raftstore/src/coprocessor/mod.rs +++ b/components/raftstore/src/coprocessor/mod.rs @@ -305,6 +305,7 @@ pub enum RegionChangeReason { CommitMerge, RollbackMerge, SwitchWitness, + Flashback, } #[derive(Clone, Copy, Debug, PartialEq)] diff --git a/components/raftstore/src/store/config.rs b/components/raftstore/src/store/config.rs index aabf173e674..ecdfbe85d3f 100644 --- a/components/raftstore/src/store/config.rs +++ b/components/raftstore/src/store/config.rs @@ -103,12 +103,17 @@ pub struct Config { /// Interval (ms) to check whether start compaction for a region. pub region_compact_check_interval: ReadableDuration, /// Number of regions for each time checking. 
- pub region_compact_check_step: u64, + pub region_compact_check_step: Option, /// Minimum number of tombstones to trigger manual compaction. pub region_compact_min_tombstones: u64, /// Minimum percentage of tombstones to trigger manual compaction. /// Should between 1 and 100. pub region_compact_tombstones_percent: u64, + /// Minimum number of redundant rows to trigger manual compaction. + pub region_compact_min_redundant_rows: u64, + /// Minimum percentage of redundant rows to trigger manual compaction. + /// Should between 1 and 100. + pub region_compact_redundant_rows_percent: u64, pub pd_heartbeat_tick_interval: ReadableDuration, pub pd_store_heartbeat_tick_interval: ReadableDuration, pub snap_mgr_gc_tick_interval: ReadableDuration, @@ -371,9 +376,11 @@ impl Default for Config { split_region_check_tick_interval: ReadableDuration::secs(10), region_split_check_diff: None, region_compact_check_interval: ReadableDuration::minutes(5), - region_compact_check_step: 100, + region_compact_check_step: None, region_compact_min_tombstones: 10000, region_compact_tombstones_percent: 30, + region_compact_min_redundant_rows: 50000, + region_compact_redundant_rows_percent: 20, pd_heartbeat_tick_interval: ReadableDuration::minutes(1), pd_store_heartbeat_tick_interval: ReadableDuration::secs(10), notify_capacity: 40960, @@ -513,6 +520,10 @@ impl Config { self.raft_log_gc_size_limit.unwrap() } + pub fn region_compact_check_step(&self) -> u64 { + self.region_compact_check_step.unwrap() + } + #[inline] pub fn warmup_entry_cache_enabled(&self) -> bool { self.max_entry_cache_warmup_duration.0 != Duration::from_secs(0) @@ -532,6 +543,16 @@ impl Config { false } + pub fn optimize_for(&mut self, raft_kv_v2: bool) { + if self.region_compact_check_step.is_none() { + if raft_kv_v2 { + self.region_compact_check_step = Some(5); + } else { + self.region_compact_check_step = Some(100); + } + } + } + pub fn validate( &mut self, region_split_size: ReadableSize, @@ -799,6 +820,7 @@ impl Config { 
} } } + assert!(self.region_compact_check_step.is_some()); Ok(()) } @@ -875,13 +897,19 @@ impl Config { .set(self.region_compact_check_interval.as_secs_f64()); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["region_compact_check_step"]) - .set(self.region_compact_check_step as f64); + .set(self.region_compact_check_step.unwrap_or_default() as f64); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["region_compact_min_tombstones"]) .set(self.region_compact_min_tombstones as f64); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["region_compact_tombstones_percent"]) .set(self.region_compact_tombstones_percent as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["region_compact_min_redundant_rows"]) + .set(self.region_compact_min_redundant_rows as f64); + CONFIG_RAFTSTORE_GAUGE + .with_label_values(&["region_compact_tombstones_percent"]) + .set(self.region_compact_tombstones_percent as f64); CONFIG_RAFTSTORE_GAUGE .with_label_values(&["pd_heartbeat_tick_interval"]) .set(self.pd_heartbeat_tick_interval.as_secs_f64()); @@ -1120,6 +1148,7 @@ mod tests { fn test_config_validate() { let split_size = coprocessor::config::SPLIT_SIZE; let mut cfg = Config::new(); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!( cfg.raft_min_election_timeout_ticks, @@ -1131,41 +1160,50 @@ mod tests { ); cfg.raft_heartbeat_ticks = 0; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.raft_election_timeout_ticks = 10; cfg.raft_heartbeat_ticks = 10; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.raft_min_election_timeout_ticks = 5; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg.raft_min_election_timeout_ticks = 25; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg.raft_min_election_timeout_ticks = 10; + cfg.optimize_for(false); 
cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg.raft_heartbeat_ticks = 11; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_threshold = 0; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_size_limit = Some(ReadableSize(0)); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_size_limit = None; + cfg.optimize_for(false); cfg.validate(ReadableSize(20), false, ReadableSize(0)) .unwrap(); assert_eq!(cfg.raft_log_gc_size_limit, Some(ReadableSize(15))); @@ -1174,23 +1212,27 @@ mod tests { cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 10; cfg.raft_store_max_leader_lease = ReadableDuration::secs(20); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_count_limit = Some(100); cfg.merge_max_log_gap = 110; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.raft_log_gc_count_limit = None; + cfg.optimize_for(false); cfg.validate(ReadableSize::mb(1), false, ReadableSize(0)) .unwrap(); assert_eq!(cfg.raft_log_gc_count_limit, Some(768)); cfg = Config::new(); cfg.merge_check_tick_interval = ReadableDuration::secs(0); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); @@ -1198,64 +1240,76 @@ mod tests { cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 10; cfg.peer_stale_state_check_interval = ReadableDuration::secs(5); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.peer_stale_state_check_interval = ReadableDuration::minutes(2); cfg.abnormal_leader_missing_duration = ReadableDuration::minutes(1); + 
cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.abnormal_leader_missing_duration = ReadableDuration::minutes(2); cfg.max_leader_missing_duration = ReadableDuration::minutes(1); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.local_read_batch_size = 0; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.max_batch_size = Some(0); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.pool_size = 0; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.max_batch_size = Some(0); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.pool_size = 0; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.apply_batch_system.max_batch_size = Some(10241); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.store_batch_system.max_batch_size = Some(10241); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.hibernate_regions = true; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(256)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(256)); cfg = Config::new(); cfg.hibernate_regions = false; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(1024)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(256)); @@ -1264,17 +1318,20 @@ mod tests { 
cfg.hibernate_regions = true; cfg.store_batch_system.max_batch_size = Some(123); cfg.apply_batch_system.max_batch_size = Some(234); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.store_batch_system.max_batch_size, Some(123)); assert_eq!(cfg.apply_batch_system.max_batch_size, Some(234)); cfg = Config::new(); cfg.future_poll_size = 0; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg = Config::new(); cfg.snap_generator_pool_size = 0; + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); @@ -1282,6 +1339,7 @@ mod tests { cfg.raft_base_tick_interval = ReadableDuration::secs(1); cfg.raft_election_timeout_ticks = 11; cfg.raft_store_max_leader_lease = ReadableDuration::secs(11); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); @@ -1289,43 +1347,54 @@ mod tests { cfg.hibernate_regions = true; cfg.max_peer_down_duration = ReadableDuration::minutes(5); cfg.peer_stale_state_check_interval = ReadableDuration::minutes(5); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.max_peer_down_duration, ReadableDuration::minutes(10)); cfg = Config::new(); cfg.raft_max_size_per_msg = ReadableSize(0); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg.raft_max_size_per_msg = ReadableSize::gb(64); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg.raft_max_size_per_msg = ReadableSize::gb(3); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg = Config::new(); cfg.raft_entry_max_size = ReadableSize(0); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); cfg.raft_entry_max_size = ReadableSize::mb(3073); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)) .unwrap_err(); 
cfg.raft_entry_max_size = ReadableSize::gb(3); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); cfg = Config::new(); + cfg.optimize_for(false); cfg.validate(split_size, false, ReadableSize(0)).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 16); cfg = Config::new(); + cfg.optimize_for(false); cfg.validate(split_size, true, split_size / 8).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 16); cfg = Config::new(); + cfg.optimize_for(false); cfg.validate(split_size, true, split_size / 20).unwrap(); assert_eq!(cfg.region_split_check_diff(), split_size / 20); cfg = Config::new(); cfg.region_split_check_diff = Some(ReadableSize(1)); + cfg.optimize_for(false); cfg.validate(split_size, true, split_size / 20).unwrap(); assert_eq!(cfg.region_split_check_diff(), ReadableSize(1)); } diff --git a/components/raftstore/src/store/fsm/apply.rs b/components/raftstore/src/store/fsm/apply.rs index d1ba6d4e774..1ef7bd843ee 100644 --- a/components/raftstore/src/store/fsm/apply.rs +++ b/components/raftstore/src/store/fsm/apply.rs @@ -306,7 +306,7 @@ pub enum ExecResult { TransferLeader { term: u64, }, - SetFlashbackState { + Flashback { region: Region, }, BatchSwitchWitness(SwitchWitness), @@ -1496,7 +1496,7 @@ where ExecResult::CommitMerge { ref region, .. } => (Some(region.clone()), None), ExecResult::RollbackMerge { ref region, .. 
} => (Some(region.clone()), None), ExecResult::IngestSst { ref ssts } => (None, Some(ssts.clone())), - ExecResult::SetFlashbackState { ref region } => (Some(region.clone()), None), + ExecResult::Flashback { ref region } => (Some(region.clone()), None), _ => (None, None), }, _ => (None, None), @@ -1565,7 +1565,7 @@ where self.region = region.clone(); self.is_merging = false; } - ExecResult::SetFlashbackState { ref region } => { + ExecResult::Flashback { ref region } => { self.region = region.clone(); } ExecResult::BatchSwitchWitness(ref switches) => { @@ -1662,10 +1662,13 @@ where let include_region = req.get_header().get_region_epoch().get_version() >= self.last_merge_version; check_req_region_epoch(req, &self.region, include_region)?; + let header = req.get_header(); + let admin_type = req.admin_request.as_ref().map(|req| req.get_cmd_type()); check_flashback_state( self.region.is_in_flashback, self.region.flashback_start_ts, - req, + header, + admin_type, self.region_id(), false, )?; @@ -2986,31 +2989,37 @@ where ctx: &mut ApplyContext, req: &AdminRequest, ) -> Result<(AdminResponse, ApplyResult)> { - let is_in_flashback = req.get_cmd_type() == AdminCmdType::PrepareFlashback; - // Modify the region meta in memory. + // Modify flashback fields in region state. let mut region = self.region.clone(); - region.set_is_in_flashback(is_in_flashback); - region.set_flashback_start_ts(req.get_prepare_flashback().get_start_ts()); - // Modify the `RegionLocalState` persisted in disk. 
- write_peer_state(ctx.kv_wb_mut(), ®ion, PeerState::Normal, None).unwrap_or_else(|e| { - panic!( - "{} failed to change the flashback state to {} for region {:?}: {:?}", - self.tag, is_in_flashback, region, e - ) - }); - match req.get_cmd_type() { AdminCmdType::PrepareFlashback => { PEER_ADMIN_CMD_COUNTER.prepare_flashback.success.inc(); + + region.set_is_in_flashback(true); + region.set_flashback_start_ts(req.get_prepare_flashback().get_start_ts()); } AdminCmdType::FinishFlashback => { PEER_ADMIN_CMD_COUNTER.finish_flashback.success.inc(); + + region.set_is_in_flashback(false); + region.clear_flashback_start_ts(); } _ => unreachable!(), } + + // Modify the `RegionLocalState` persisted in disk. + write_peer_state(ctx.kv_wb_mut(), ®ion, PeerState::Normal, None).unwrap_or_else(|e| { + panic!( + "{} failed to change the flashback state to {:?} for region {:?}: {:?}", + self.tag, + req.get_cmd_type(), + region, + e + ) + }); Ok(( AdminResponse::default(), - ApplyResult::Res(ExecResult::SetFlashbackState { region }), + ApplyResult::Res(ExecResult::Flashback { region }), )) } diff --git a/components/raftstore/src/store/fsm/peer.rs b/components/raftstore/src/store/fsm/peer.rs index 72eb3c59753..910a08c3a0b 100644 --- a/components/raftstore/src/store/fsm/peer.rs +++ b/components/raftstore/src/store/fsm/peer.rs @@ -5012,7 +5012,7 @@ where } ExecResult::IngestSst { ssts } => self.on_ingest_sst_result(ssts), ExecResult::TransferLeader { term } => self.on_transfer_leader(term), - ExecResult::SetFlashbackState { region } => self.on_set_flashback_state(region), + ExecResult::Flashback { region } => self.on_set_flashback_state(region), ExecResult::BatchSwitchWitness(switches) => { self.on_ready_batch_switch_witness(switches) } @@ -5260,10 +5260,13 @@ where // the apply phase and because a read-only request doesn't need to be applied, // so it will be allowed during the flashback progress, for example, a snapshot // request. 
+ let header = msg.get_header(); + let admin_type = msg.admin_request.as_ref().map(|req| req.get_cmd_type()); if let Err(e) = util::check_flashback_state( self.region().is_in_flashback, self.region().flashback_start_ts, - msg, + header, + admin_type, region_id, true, ) { @@ -5280,7 +5283,7 @@ where .invalid_proposal .flashback_not_prepared .inc(), - _ => unreachable!(), + _ => unreachable!("{:?}", e), } return Err(e); } diff --git a/components/raftstore/src/store/fsm/store.rs b/components/raftstore/src/store/fsm/store.rs index 2cdacd9d3e9..9a7df9d5473 100644 --- a/components/raftstore/src/store/fsm/store.rs +++ b/components/raftstore/src/store/fsm/store.rs @@ -679,7 +679,12 @@ struct Store { stopped: bool, start_time: Option, consistency_check_time: HashMap, - last_unreachable_report: HashMap, + store_reachability: HashMap, +} + +struct StoreReachability { + last_broadcast: Instant, + received_message_count: u64, } pub struct StoreFsm @@ -703,7 +708,7 @@ where stopped: false, start_time: None, consistency_check_time: HashMap::default(), - last_unreachable_report: HashMap::default(), + store_reachability: HashMap::default(), }, receiver: rx, }); @@ -2415,7 +2420,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER // Start from last checked key. 
let mut ranges_need_check = - Vec::with_capacity(self.ctx.cfg.region_compact_check_step as usize + 1); + Vec::with_capacity(self.ctx.cfg.region_compact_check_step() as usize + 1); ranges_need_check.push(self.fsm.store.last_compact_checked_key.clone()); let largest_key = { @@ -2435,7 +2440,7 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER )); ranges_need_check.extend( left_ranges - .take(self.ctx.cfg.region_compact_check_step as usize) + .take(self.ctx.cfg.region_compact_check_step() as usize) .map(|(k, _)| k.to_owned()), ); @@ -2894,22 +2899,35 @@ impl<'a, EK: KvEngine, ER: RaftEngine, T: Transport> StoreFsmDelegate<'a, EK, ER fn on_store_unreachable(&mut self, store_id: u64) { let now = Instant::now(); let unreachable_backoff = self.ctx.cfg.unreachable_backoff.0; - if self - .fsm - .store - .last_unreachable_report - .get(&store_id) - .map_or(unreachable_backoff, |t| now.saturating_duration_since(*t)) - < unreachable_backoff - { - return; - } + let new_messages = MESSAGE_RECV_BY_STORE + .with_label_values(&[&format!("{}", store_id)]) + .get(); + match self.fsm.store.store_reachability.entry(store_id) { + HashMapEntry::Vacant(x) => { + x.insert(StoreReachability { + last_broadcast: now, + received_message_count: new_messages, + }); + } + HashMapEntry::Occupied(x) => { + let ob = x.into_mut(); + if now.saturating_duration_since(ob.last_broadcast) < unreachable_backoff + // If there are no new messages come from `store_id`, it's not + // necessary to do redundant broadcasts. + || (new_messages <= ob.received_message_count && new_messages > 0) + { + return; + } + ob.last_broadcast = now; + ob.received_message_count = new_messages; + } + }; + info!( "broadcasting unreachable"; "store_id" => self.fsm.store.id, "unreachable_store_id" => store_id, ); - self.fsm.store.last_unreachable_report.insert(store_id, now); // It's possible to acquire the lock and only send notification to // involved regions. 
However loop over all the regions can take a // lot of time, which may block other operations. diff --git a/components/raftstore/src/store/local_metrics.rs b/components/raftstore/src/store/local_metrics.rs index 0e6a09cbf0b..baf63814416 100644 --- a/components/raftstore/src/store/local_metrics.rs +++ b/components/raftstore/src/store/local_metrics.rs @@ -246,11 +246,11 @@ impl TimeTracker { now: std::time::Instant, local_metric: &LocalHistogram, tracker_metric: impl FnOnce(&mut Tracker) -> &mut u64, - ) { + ) -> u64 { let dur = now.saturating_duration_since(self.start); local_metric.observe(dur.as_secs_f64()); if self.token == INVALID_TRACKER_TOKEN { - return; + return 0; } GLOBAL_TRACKERS.with_tracker(self.token, |tracker| { let metric = tracker_metric(tracker); @@ -258,6 +258,7 @@ impl TimeTracker { *metric = dur.as_nanos() as u64; } }); + dur.as_nanos() as u64 } #[inline] diff --git a/components/raftstore/src/store/metrics.rs b/components/raftstore/src/store/metrics.rs index 7df8819c998..c69875ae998 100644 --- a/components/raftstore/src/store/metrics.rs +++ b/components/raftstore/src/store/metrics.rs @@ -868,4 +868,11 @@ lazy_static! 
{ "Total snapshot generate limit used", ) .unwrap(); + + pub static ref MESSAGE_RECV_BY_STORE: IntCounterVec = register_int_counter_vec!( + "tikv_raftstore_message_recv_by_store", + "Messages received by store", + &["store"] + ) + .unwrap(); } diff --git a/components/raftstore/src/store/mod.rs b/components/raftstore/src/store/mod.rs index c007b622ee1..7a2c04e2450 100644 --- a/components/raftstore/src/store/mod.rs +++ b/components/raftstore/src/store/mod.rs @@ -84,6 +84,8 @@ pub use self::{ ReadExecutor, ReadExecutorProvider, ReadProgress, ReadStats, RefreshConfigTask, RegionTask, SplitCheckRunner, SplitCheckTask, SplitConfig, SplitConfigManager, SplitInfo, StoreMetaDelegate, StoreStatsReporter, TrackVer, WriteStats, - NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, + DEFAULT_BIG_REGION_QPS_THRESHOLD, DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, + NUM_COLLECT_STORE_INFOS_PER_HEARTBEAT, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, }, }; diff --git a/components/raftstore/src/store/peer.rs b/components/raftstore/src/store/peer.rs index 8dc69a0def4..f21fccddff5 100644 --- a/components/raftstore/src/store/peer.rs +++ b/components/raftstore/src/store/peer.rs @@ -27,7 +27,7 @@ use fail::fail_point; use getset::{Getters, MutGetters}; use kvproto::{ errorpb, - kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp, LockInfo}, + kvrpcpb::{DiskFullOpt, ExtraOp as TxnExtraOp}, metapb::{self, PeerRole}, pdpb::{self, PeerStats}, raft_cmdpb::{ @@ -571,13 +571,12 @@ pub fn start_unsafe_recovery_report( pub fn propose_read_index( raft_group: &mut RawNode, request: Option<&raft_cmdpb::ReadIndexRequest>, - locked: Option<&LockInfo>, ) -> (Uuid, bool) { let last_pending_read_count = raft_group.raft.pending_read_count(); let last_ready_read_count = raft_group.raft.ready_read_count(); let id = Uuid::new_v4(); - raft_group.read_index(ReadIndexContext::fields_to_bytes(id, request, locked)); + 
raft_group.read_index(ReadIndexContext::fields_to_bytes(id, request, None)); let pending_read_count = raft_group.raft.pending_read_count(); let ready_read_count = raft_group.raft.ready_read_count(); @@ -3614,7 +3613,7 @@ where let term = self.term(); self.leader_lease .maybe_new_remote_lease(term) - .map(ReadProgress::leader_lease) + .map(ReadProgress::set_leader_lease) }; if let Some(progress) = progress { let mut meta = ctx.store_meta.lock().unwrap(); @@ -4051,7 +4050,7 @@ where .get_mut(0) .filter(|req| req.has_read_index()) .map(|req| req.take_read_index()); - let (id, dropped) = self.propose_read_index(request.as_ref(), None); + let (id, dropped) = self.propose_read_index(request.as_ref()); if dropped && self.is_leader() { // The message gets dropped silently, can't be handled anymore. apply::notify_stale_req(self.term(), cb); @@ -4098,9 +4097,8 @@ where pub fn propose_read_index( &mut self, request: Option<&raft_cmdpb::ReadIndexRequest>, - locked: Option<&LockInfo>, ) -> (Uuid, bool) { - propose_read_index(&mut self.raft_group, request, locked) + propose_read_index(&mut self.raft_group, request) } /// Returns (minimal matched, minimal committed_index) diff --git a/components/raftstore/src/store/snap.rs b/components/raftstore/src/store/snap.rs index 036d6f0a812..62744501195 100644 --- a/components/raftstore/src/store/snap.rs +++ b/components/raftstore/src/store/snap.rs @@ -2075,6 +2075,8 @@ pub struct TabletSnapManager { key_manager: Option>, receiving: Arc>>, stats: Arc>>, + sending_count: Arc, + recving_count: Arc, } impl TabletSnapManager { @@ -2099,6 +2101,8 @@ impl TabletSnapManager { key_manager, receiving: Arc::default(), stats: Arc::default(), + sending_count: Arc::default(), + recving_count: Arc::default(), }) } @@ -2131,8 +2135,8 @@ impl TabletSnapManager { .filter(|stat| stat.get_total_duration_sec() > 1) .collect(); SnapStats { - sending_count: 0, - receiving_count: 0, + sending_count: self.sending_count.load(Ordering::SeqCst), + receiving_count: 
self.recving_count.load(Ordering::SeqCst), stats, } } @@ -2218,6 +2222,14 @@ impl TabletSnapManager { }) } + pub fn sending_count(&self) -> &Arc { + &self.sending_count + } + + pub fn recving_count(&self) -> &Arc { + &self.recving_count + } + #[inline] pub fn key_manager(&self) -> &Option> { &self.key_manager diff --git a/components/raftstore/src/store/util.rs b/components/raftstore/src/store/util.rs index f5a23538ad5..539dfa22403 100644 --- a/components/raftstore/src/store/util.rs +++ b/components/raftstore/src/store/util.rs @@ -344,27 +344,28 @@ pub fn compare_region_epoch( pub fn check_flashback_state( is_in_flashback: bool, flashback_start_ts: u64, - req: &RaftCmdRequest, + header: &RaftRequestHeader, + admin_type: Option, region_id: u64, skip_not_prepared: bool, ) -> Result<()> { // The admin flashback cmd could be proposed/applied under any state. - if req.has_admin_request() - && (req.get_admin_request().get_cmd_type() == AdminCmdType::PrepareFlashback - || req.get_admin_request().get_cmd_type() == AdminCmdType::FinishFlashback) + if let Some(ty) = admin_type + && (ty == AdminCmdType::PrepareFlashback + || ty == AdminCmdType::FinishFlashback) { return Ok(()); } // TODO: only use `flashback_start_ts` to check flashback state. let is_in_flashback = is_in_flashback || flashback_start_ts > 0; - let is_flashback_request = WriteBatchFlags::from_bits_truncate(req.get_header().get_flags()) + let is_flashback_request = WriteBatchFlags::from_bits_truncate(header.get_flags()) .contains(WriteBatchFlags::FLASHBACK); // If the region is in the flashback state: // - A request with flashback flag will be allowed. // - A read request whose `read_ts` is smaller than `flashback_start_ts` will // be allowed. 
if is_in_flashback && !is_flashback_request { - if let Ok(read_ts) = decode_u64(&mut req.get_header().get_flag_data()) { + if let Ok(read_ts) = decode_u64(&mut header.get_flag_data()) { if read_ts != 0 && read_ts < flashback_start_ts { return Ok(()); } diff --git a/components/raftstore/src/store/worker/cleanup_sst.rs b/components/raftstore/src/store/worker/cleanup_sst.rs index 5e58ab77b63..8174b872f4b 100644 --- a/components/raftstore/src/store/worker/cleanup_sst.rs +++ b/components/raftstore/src/store/worker/cleanup_sst.rs @@ -1,15 +1,17 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. -use std::{fmt, marker::PhantomData, sync::Arc}; +use std::{error::Error, fmt, marker::PhantomData, sync::Arc}; use engine_traits::KvEngine; -use kvproto::import_sstpb::SstMeta; +use kvproto::{import_sstpb::SstMeta, metapb::Region}; use pd_client::PdClient; use sst_importer::SstImporter; use tikv_util::{error, worker::Runnable}; use crate::store::{util::is_epoch_stale, StoreMsg, StoreRouter}; +type Result = std::result::Result>; + pub enum Task { DeleteSst { ssts: Vec }, ValidateSst { ssts: Vec }, @@ -64,12 +66,39 @@ where } } + fn get_region_by_meta(&self, sst: &SstMeta) -> Result { + // The SST meta has been delivered with a range, use it directly. + // For now, no case will reach this. But this still could be a guard for + // reducing the surprise in the future... + if !sst.get_range().get_start().is_empty() || !sst.get_range().get_end().is_empty() { + return self + .pd_client + .get_region(sst.get_range().get_start()) + .map_err(Into::into); + } + // Otherwise, no range was provided. + let query_by_start_key_of_full_meta = || { + let start_key = self + .importer + .load_start_key_by_meta::(sst)?
+ .ok_or_else(|| -> Box { + "failed to load start key from sst, the sst might be empty".into() + })?; + let region = self.pd_client.get_region(&start_key)?; + Result::Ok(region) + }; + query_by_start_key_of_full_meta() + .map_err(|err| + format!("failed to load full sst meta from disk for {:?} and there isn't extra information provided: {err}", sst.get_uuid()).into() + ) + } + /// Validates whether the SST is stale or not. fn handle_validate_sst(&self, ssts: Vec) { let store_id = self.store_id; let mut invalid_ssts = Vec::new(); for sst in ssts { - match self.pd_client.get_region(sst.get_range().get_start()) { + match self.get_region_by_meta(&sst) { Ok(r) => { // The region id may or may not be the same as the // SST file, but it doesn't matter, because the @@ -87,7 +116,7 @@ where invalid_ssts.push(sst); } Err(e) => { - error!(%e; "get region failed"); + error!("get region failed"; "err" => %e); } } } diff --git a/components/raftstore/src/store/worker/compact.rs b/components/raftstore/src/store/worker/compact.rs index 7bc7052b277..4448e26a5b3 100644 --- a/components/raftstore/src/store/worker/compact.rs +++ b/components/raftstore/src/store/worker/compact.rs @@ -206,12 +206,11 @@ fn collect_ranges_need_compact( for range in ranges.windows(2) { // Get total entries and total versions in this range and checks if it needs to // be compacted. 
- if let Some((num_ent, num_ver)) = - box_try!(engine.get_range_entries_and_versions(CF_WRITE, &range[0], &range[1])) + if let Some(range_stats) = box_try!(engine.get_range_stats(CF_WRITE, &range[0], &range[1])) { if need_compact( - num_ent, - num_ver, + range_stats.num_entries, + range_stats.num_versions, tombstones_num_threshold, tombstones_percent_threshold, ) { @@ -357,12 +356,12 @@ mod tests { engine.flush_cf(CF_WRITE, true).unwrap(); let (start, end) = (data_key(b"k0"), data_key(b"k5")); - let (entries, version) = engine - .get_range_entries_and_versions(CF_WRITE, &start, &end) + let range_stats = engine + .get_range_stats(CF_WRITE, &start, &end) .unwrap() .unwrap(); - assert_eq!(entries, 10); - assert_eq!(version, 5); + assert_eq!(range_stats.num_entries, 10); + assert_eq!(range_stats.num_versions, 5); // mvcc_put 5..10 for i in 5..10 { @@ -372,12 +371,9 @@ mod tests { engine.flush_cf(CF_WRITE, true).unwrap(); let (s, e) = (data_key(b"k5"), data_key(b"k9")); - let (entries, version) = engine - .get_range_entries_and_versions(CF_WRITE, &s, &e) - .unwrap() - .unwrap(); - assert_eq!(entries, 5); - assert_eq!(version, 5); + let range_stats = engine.get_range_stats(CF_WRITE, &s, &e).unwrap().unwrap(); + assert_eq!(range_stats.num_entries, 5); + assert_eq!(range_stats.num_versions, 5); let ranges_need_to_compact = collect_ranges_need_compact( &engine, @@ -399,12 +395,9 @@ mod tests { engine.flush_cf(CF_WRITE, true).unwrap(); let (s, e) = (data_key(b"k5"), data_key(b"k9")); - let (entries, version) = engine - .get_range_entries_and_versions(CF_WRITE, &s, &e) - .unwrap() - .unwrap(); - assert_eq!(entries, 10); - assert_eq!(version, 5); + let range_stats = engine.get_range_stats(CF_WRITE, &s, &e).unwrap().unwrap(); + assert_eq!(range_stats.num_entries, 10); + assert_eq!(range_stats.num_versions, 5); let ranges_need_to_compact = collect_ranges_need_compact( &engine, diff --git a/components/raftstore/src/store/worker/metrics.rs 
b/components/raftstore/src/store/worker/metrics.rs index 36a217be607..2ad06d9c69d 100644 --- a/components/raftstore/src/store/worker/metrics.rs +++ b/components/raftstore/src/store/worker/metrics.rs @@ -70,6 +70,7 @@ make_static_metric! { pub struct LocalReadMetrics { pub local_executed_requests: LocalIntCounter, pub local_executed_stale_read_requests: LocalIntCounter, + pub local_executed_replica_read_requests: LocalIntCounter, pub local_executed_snapshot_cache_hit: LocalIntCounter, pub reject_reason: LocalReadRejectCounter, pub renew_lease_advance: LocalIntCounter, @@ -81,6 +82,7 @@ thread_local! { LocalReadMetrics { local_executed_requests: LOCAL_READ_EXECUTED_REQUESTS.local(), local_executed_stale_read_requests: LOCAL_READ_EXECUTED_STALE_READ_REQUESTS.local(), + local_executed_replica_read_requests: LOCAL_READ_EXECUTED_REPLICA_READ_REQUESTS.local(), local_executed_snapshot_cache_hit: LOCAL_READ_EXECUTED_CACHE_REQUESTS.local(), reject_reason: LocalReadRejectCounter::from(&LOCAL_READ_REJECT_VEC), renew_lease_advance: LOCAL_READ_RENEW_LEASE_ADVANCE_COUNTER.local(), @@ -98,6 +100,7 @@ pub fn maybe_tls_local_read_metrics_flush() { if m.last_flush_time.saturating_elapsed() >= Duration::from_millis(METRICS_FLUSH_INTERVAL) { m.local_executed_requests.flush(); m.local_executed_stale_read_requests.flush(); + m.local_executed_replica_read_requests.flush(); m.local_executed_snapshot_cache_hit.flush(); m.reject_reason.flush(); m.renew_lease_advance.flush(); @@ -180,6 +183,11 @@ lazy_static! { "Total number of stale read requests directly executed by local reader." ) .unwrap(); + pub static ref LOCAL_READ_EXECUTED_REPLICA_READ_REQUESTS: IntCounter = register_int_counter!( + "tikv_raftstore_local_read_executed_replica_read_requests", + "Total number of replica read requests directly executed by local reader."
+ ) + .unwrap(); pub static ref RAFT_LOG_GC_WRITE_DURATION_HISTOGRAM: Histogram = register_histogram!( "tikv_raftstore_raft_log_gc_write_duration_secs", "Bucketed histogram of write duration of raft log gc.", diff --git a/components/raftstore/src/store/worker/mod.rs b/components/raftstore/src/store/worker/mod.rs index ac23f4e58d5..62d27b2e88b 100644 --- a/components/raftstore/src/store/worker/mod.rs +++ b/components/raftstore/src/store/worker/mod.rs @@ -44,6 +44,10 @@ pub use self::{ split_check::{ Bucket, BucketRange, KeyEntry, Runner as SplitCheckRunner, Task as SplitCheckTask, }, - split_config::{SplitConfig, SplitConfigManager}, + split_config::{ + SplitConfig, SplitConfigManager, BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + DEFAULT_BIG_REGION_BYTE_THRESHOLD, DEFAULT_BIG_REGION_QPS_THRESHOLD, + DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + }, split_controller::{AutoSplitController, ReadStats, SplitConfigChange, SplitInfo, WriteStats}, }; diff --git a/components/raftstore/src/store/worker/read.rs b/components/raftstore/src/store/worker/read.rs index 022bd457cd5..488d24ac134 100644 --- a/components/raftstore/src/store/worker/read.rs +++ b/components/raftstore/src/store/worker/read.rs @@ -496,7 +496,7 @@ impl ReadDelegate { self.applied_term = applied_term; } Progress::LeaderLease(leader_lease) => { - self.leader_lease = Some(leader_lease); + self.leader_lease = leader_lease; } Progress::RegionBuckets(bucket_meta) => { self.bucket_meta = Some(bucket_meta); @@ -631,7 +631,7 @@ pub enum Progress { Region(metapb::Region), Term(u64), AppliedTerm(u64), - LeaderLease(RemoteLease), + LeaderLease(Option), RegionBuckets(Arc), WaitData(bool), } @@ -649,8 +649,12 @@ impl Progress { Progress::AppliedTerm(applied_term) } - pub fn leader_lease(lease: RemoteLease) -> Progress { - Progress::LeaderLease(lease) + pub fn set_leader_lease(lease: RemoteLease) -> Progress { + Progress::LeaderLease(Some(lease)) + } + + pub fn unset_leader_lease() -> 
Progress { + Progress::LeaderLease(None) } pub fn region_buckets(bucket_meta: Arc) -> Progress { @@ -828,20 +832,32 @@ where // be performed. let is_in_flashback = delegate.region.is_in_flashback; let flashback_start_ts = delegate.region.flashback_start_ts; - if let Err(e) = - util::check_flashback_state(is_in_flashback, flashback_start_ts, req, region_id, false) - { - TLS_LOCAL_READ_METRICS.with(|m| match e { + let header = req.get_header(); + let admin_type = req.admin_request.as_ref().map(|req| req.get_cmd_type()); + if let Err(e) = util::check_flashback_state( + is_in_flashback, + flashback_start_ts, + header, + admin_type, + region_id, + true, + ) { + debug!("rejected by flashback state"; + "error" => ?e, + "is_in_flashback" => is_in_flashback, + "tag" => &delegate.tag); + match e { Error::FlashbackNotPrepared(_) => { - m.borrow_mut().reject_reason.flashback_not_prepared.inc() + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().reject_reason.flashback_not_prepared.inc()); } Error::FlashbackInProgress(..) => { - m.borrow_mut().reject_reason.flashback_in_progress.inc() + TLS_LOCAL_READ_METRICS + .with(|m| m.borrow_mut().reject_reason.flashback_in_progress.inc()); } - _ => unreachable!(), - }); - debug!("rejected by flashback state"; "is_in_flashback" => is_in_flashback, "tag" => &delegate.tag); - return Ok(None); + _ => unreachable!("{:?}", e), + }; + return Err(e); } Ok(Some(delegate)) @@ -1536,7 +1552,7 @@ mod tests { cmd.mut_header().set_term(term6 + 3); lease.expire_remote_lease(); let remote_lease = lease.maybe_new_remote_lease(term6 + 3).unwrap(); - let pg = Progress::leader_lease(remote_lease); + let pg = Progress::set_leader_lease(remote_lease); { let mut meta = store_meta.lock().unwrap(); meta.readers.get_mut(&1).unwrap().update(pg); @@ -1668,7 +1684,7 @@ mod tests { { let mut lease = Lease::new(Duration::seconds(1), Duration::milliseconds(250)); // 1s is long enough. 
let remote = lease.maybe_new_remote_lease(3).unwrap(); - let pg = Progress::leader_lease(remote); + let pg = Progress::set_leader_lease(remote); let mut meta = store_meta.lock().unwrap(); meta.readers.get_mut(&1).unwrap().update(pg); } diff --git a/components/raftstore/src/store/worker/split_config.rs b/components/raftstore/src/store/worker/split_config.rs index 7857ae10d8e..8fec853bb00 100644 --- a/components/raftstore/src/store/worker/split_config.rs +++ b/components/raftstore/src/store/worker/split_config.rs @@ -6,13 +6,18 @@ use lazy_static::lazy_static; use online_config::{ConfigChange, ConfigManager, OnlineConfig}; use parking_lot::Mutex; use serde::{Deserialize, Serialize}; -use tikv_util::{config::VersionTrack, info}; +use tikv_util::{ + config::{ReadableSize, VersionTrack}, + info, +}; const DEFAULT_DETECT_TIMES: u64 = 10; const DEFAULT_SAMPLE_THRESHOLD: u64 = 100; pub(crate) const DEFAULT_SAMPLE_NUM: usize = 20; -const DEFAULT_QPS_THRESHOLD: usize = 3000; -const DEFAULT_BYTE_THRESHOLD: usize = 30 * 1024 * 1024; +pub const DEFAULT_QPS_THRESHOLD: usize = 3000; +pub const DEFAULT_BIG_REGION_QPS_THRESHOLD: usize = 7000; +pub const DEFAULT_BYTE_THRESHOLD: usize = 30 * 1024 * 1024; +pub const DEFAULT_BIG_REGION_BYTE_THRESHOLD: usize = 100 * 1024 * 1024; // We get balance score by // abs(sample.left-sample.right)/(sample.right+sample.left). It will be used to @@ -43,7 +48,8 @@ const DEFAULT_UNIFIED_READ_POOL_THREAD_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.8; // `REGION_CPU_OVERLOAD_THRESHOLD_RATIO` as a percentage of the Unified Read // Poll, it will be added into the hot region list and may be split later as the // top hot CPU region. -pub(crate) const REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.25; +pub const REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.25; +pub const BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO: f64 = 0.75; lazy_static! 
{ static ref SPLIT_CONFIG: Mutex>>> = Mutex::new(None); @@ -134,6 +140,15 @@ impl SplitConfig { } Ok(()) } + + pub fn optimize_for(&mut self, region_size: ReadableSize) { + const LARGE_REGION_SIZE_IN_MB: u64 = 4096; + if region_size.as_mb() >= LARGE_REGION_SIZE_IN_MB { + self.qps_threshold = DEFAULT_BIG_REGION_QPS_THRESHOLD; + self.region_cpu_overload_threshold_ratio = BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO; + self.byte_threshold = DEFAULT_BIG_REGION_BYTE_THRESHOLD; + } + } } #[derive(Clone)] diff --git a/components/server/src/server.rs b/components/server/src/server.rs index 625e8d8a31b..228358d52a1 100644 --- a/components/server/src/server.rs +++ b/components/server/src/server.rs @@ -85,6 +85,7 @@ use tikv::{ }, server::{ config::{Config as ServerConfig, ServerConfigManager}, + debug::{Debugger, DebuggerImpl}, gc_worker::{AutoGcConfig, GcWorker}, lock_manager::LockManager, raftkv::ReplicaReadLockChecker, @@ -713,6 +714,7 @@ where let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); + self.core.config.raft_store.optimize_for(false); self.core .config .raft_store @@ -1030,14 +1032,18 @@ where .unwrap() .register(tikv::config::Module::Import, Box::new(import_cfg_mgr)); + let mut debugger = DebuggerImpl::new( + engines.engines.clone(), + self.cfg_controller.as_ref().unwrap().clone(), + ); + debugger.set_kv_statistics(self.kv_statistics.clone()); + debugger.set_raft_statistics(self.raft_statistics.clone()); + // Debug service. 
let debug_service = DebugService::new( - engines.engines.clone(), - self.kv_statistics.clone(), - self.raft_statistics.clone(), + debugger, servers.server.get_debug_thread_pool().clone(), engines.engine.raft_extension(), - self.cfg_controller.as_ref().unwrap().clone(), ); if servers .server diff --git a/components/server/src/server2.rs b/components/server/src/server2.rs index 4d1a9f2daf6..da970a7e749 100644 --- a/components/server/src/server2.rs +++ b/components/server/src/server2.rs @@ -38,8 +38,9 @@ use grpcio::{EnvBuilder, Environment}; use grpcio_health::HealthService; use kvproto::{ brpb::create_backup, cdcpb_grpc::create_change_data, deadlock::create_deadlock, - diagnosticspb::create_diagnostics, import_sstpb_grpc::create_import_sst, kvrpcpb::ApiVersion, - logbackuppb::create_log_backup, resource_usage_agent::create_resource_metering_pub_sub, + debugpb_grpc::create_debug, diagnosticspb::create_diagnostics, + import_sstpb_grpc::create_import_sst, kvrpcpb::ApiVersion, logbackuppb::create_log_backup, + resource_usage_agent::create_resource_metering_pub_sub, }; use pd_client::{ meta_storage::{Checked, Sourced}, @@ -72,11 +73,13 @@ use tikv::{ }, server::{ config::{Config as ServerConfig, ServerConfigManager}, + debug::Debugger, + debug2::DebuggerImplV2, gc_worker::{AutoGcConfig, GcWorker}, lock_manager::LockManager, raftkv::ReplicaReadLockChecker, resolve, - service::DiagnosticsService, + service::{DebugService, DiagnosticsService}, status_server::StatusServer, KvEngineFactoryBuilder, NodeV2, RaftKv2, Server, CPU_CORES_QUOTA_GAUGE, GRPC_THREAD_PREFIX, }, @@ -679,6 +682,7 @@ where let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); + self.core.config.raft_store.optimize_for(true); self.core .config .raft_store @@ -906,6 +910,28 @@ where .unwrap() .register(tikv::config::Module::Import, Box::new(import_cfg_mgr)); + let mut debugger = DebuggerImplV2::new( + self.tablet_registry.clone().unwrap(), + 
self.engines.as_ref().unwrap().raft_engine.clone(), + self.cfg_controller.as_ref().unwrap().clone(), + ); + debugger.set_kv_statistics(self.kv_statistics.clone()); + debugger.set_raft_statistics(self.raft_statistics.clone()); + + // Debug service. + let debug_service = DebugService::new( + debugger, + servers.server.get_debug_thread_pool().clone(), + engines.engine.raft_extension(), + ); + if servers + .server + .register_service(create_debug(debug_service)) + .is_some() + { + fatal!("failed to register debug service"); + } + let cdc_service = cdc::Service::new( self.cdc_scheduler.as_ref().unwrap().clone(), self.cdc_memory_quota.as_ref().unwrap().clone(), diff --git a/components/sst_importer/Cargo.toml b/components/sst_importer/Cargo.toml index 8e2799b7437..cf48890291b 100644 --- a/components/sst_importer/Cargo.toml +++ b/components/sst_importer/Cargo.toml @@ -12,6 +12,10 @@ cloud-azure = ["external_storage_export/cloud-azure"] cloud-storage-grpc = ["external_storage_export/cloud-storage-grpc"] cloud-storage-dylib = ["external_storage_export/cloud-storage-dylib"] +test-engines-rocksdb = [ + "engine_test/test-engines-rocksdb", +] + [dependencies] api_version = { workspace = true } crc32fast = "1.2" @@ -32,6 +36,7 @@ log_wrappers = { workspace = true } online_config = { workspace = true } openssl = "0.10" prometheus = { version = "0.13", default-features = false } +protobuf = { version = "2.8", features = ["bytes"] } rand = "0.8" serde = "1.0" serde_derive = "1.0" @@ -45,6 +50,7 @@ txn_types = { workspace = true } uuid = { version = "0.8.1", features = ["serde", "v4"] } [dev-dependencies] +engine_test = { workspace = true } tempfile = "3.0" test_sst_importer = { workspace = true } test_util = { workspace = true } diff --git a/components/sst_importer/src/import_file.rs b/components/sst_importer/src/import_file.rs index 84d2f67bbab..a99c7c0f7e1 100644 --- a/components/sst_importer/src/import_file.rs +++ b/components/sst_importer/src/import_file.rs @@ -12,7 +12,8 @@ use 
api_version::api_v2::TIDB_RANGES_COMPLEMENT; use encryption::{DataKeyManager, EncrypterWriter}; use engine_rocks::{get_env, RocksSstReader}; use engine_traits::{ - iter_option, EncryptionKeyManager, Iterator, KvEngine, RefIterable, SstMetaInfo, SstReader, + iter_option, EncryptionKeyManager, IterOptions, Iterator, KvEngine, RefIterable, SstExt, + SstMetaInfo, SstReader, }; use file_system::{get_io_rate_limiter, sync_dir, File, OpenOptions}; use keys::data_key; @@ -166,11 +167,12 @@ impl ImportFile { fn cleanup(&mut self) -> Result<()> { self.file.take(); - if self.path.temp.exists() { + let path = &self.path.temp; + if path.exists() { if let Some(ref manager) = self.key_manager { - manager.delete_file(self.path.temp.to_str().unwrap())?; + manager.delete_file(path.to_str().unwrap())?; } - file_system::remove_file(&self.path.temp)?; + file_system::remove_file(path)?; } Ok(()) } @@ -413,6 +415,31 @@ impl ImportDir { Ok(()) } + pub fn load_start_key_by_meta( + &self, + meta: &SstMeta, + km: Option>, + ) -> Result>> { + let path = self.join(meta)?; + let r = match km { + Some(km) => E::SstReader::open_encrypted(&path.save.to_string_lossy(), km)?, + None => E::SstReader::open(&path.save.to_string_lossy())?, + }; + let opts = IterOptions::new(None, None, false); + let mut i = r.iter(opts)?; + if !i.seek_to_first()? || !i.valid()? { + return Ok(None); + } + // Should we warn if the key doesn't start with the prefix key? (Is that + // possible?) + // Also note this brings implicit coupling between this and + // RocksEngine. Perhaps it is better to make the engine to provide + // decode functions. Anyway we have directly used the RocksSstReader + // somewhere... This won't make things worse. + let real_key = i.key().strip_prefix(keys::DATA_PREFIX_KEY); + Ok(real_key.map(ToOwned::to_owned)) + } + pub fn list_ssts(&self) -> Result> { let mut ssts = Vec::new(); for e in file_system::read_dir(&self.root_dir)? 
{ @@ -421,9 +448,9 @@ impl ImportDir { continue; } let path = e.path(); - match path_to_sst_meta(&path) { + match parse_meta_from_path(&path) { Ok(sst) => ssts.push(sst), - Err(e) => error!(%e; "path_to_sst_meta failed"; "path" => %path.to_str().unwrap(),), + Err(e) => error!(%e; "path_to_sst_meta failed"; "path" => %path.display(),), } } Ok(ssts) @@ -444,7 +471,7 @@ pub fn sst_meta_to_path(meta: &SstMeta) -> Result { ))) } -pub fn path_to_sst_meta>(path: P) -> Result { +pub fn parse_meta_from_path>(path: P) -> Result { let path = path.as_ref(); let file_name = match path.file_name().and_then(|n| n.to_str()) { Some(name) => name, @@ -496,7 +523,7 @@ mod test { let expected_path = format!("{}_1_2_3_default.sst", uuid); assert_eq!(path.to_str().unwrap(), &expected_path); - let new_meta = path_to_sst_meta(path).unwrap(); + let new_meta = parse_meta_from_path(path).unwrap(); assert_eq!(meta, new_meta); } @@ -516,7 +543,84 @@ mod test { meta.get_region_epoch().get_version(), SST_SUFFIX, )); - let new_meta = path_to_sst_meta(path).unwrap(); + let new_meta = parse_meta_from_path(path).unwrap(); assert_eq!(meta, new_meta); } + + #[cfg(feature = "test-engines-rocksdb")] + fn test_path_with_range_and_km(km: Option) { + use engine_rocks::{RocksEngine, RocksSstWriterBuilder}; + use engine_test::ctor::{CfOptions, DbOptions}; + use engine_traits::{SstWriter, SstWriterBuilder}; + use tempfile::TempDir; + let arcmgr = km.map(Arc::new); + let tmp = TempDir::new().unwrap(); + let dir = ImportDir::new(tmp.path()).unwrap(); + let mut meta = SstMeta::default(); + let mut rng = Range::new(); + rng.set_start(b"hello".to_vec()); + let uuid = Uuid::new_v4(); + meta.set_uuid(uuid.as_bytes().to_vec()); + meta.set_region_id(1); + meta.set_range(rng); + meta.mut_region_epoch().set_conf_ver(222); + meta.mut_region_epoch().set_version(333); + let mut db_opt = DbOptions::default(); + db_opt.set_key_manager(arcmgr.clone()); + let e = engine_test::kv::new_engine_opt( + 
&tmp.path().join("eng").to_string_lossy(), + db_opt, + vec![(CF_DEFAULT, CfOptions::new())], + ) + .unwrap(); + let f = dir.create(&meta, arcmgr.clone()).unwrap(); + let dp = f.path.clone(); + let mut w = RocksSstWriterBuilder::new() + .set_db(&e) + .set_cf(CF_DEFAULT) + .build(f.path.temp.to_str().unwrap()) + .unwrap(); + w.put(b"zhello", concat!("This is the start key of the SST, ", + "how about some of our users uploads metas with range not aligned with the content of SST?", + "No, at least for now, tidb-lightning won't do so.").as_bytes()).unwrap(); + w.put( + b"zworld", + concat!( + "This is the end key of the SST, ", + "you might notice that all keys have a extra prefix 'z', that was appended by the RocksEngine implementation.", + "It is a little weird that the user key isn't the same in SST. But anyway reasonable. We have bypassed some layers." + ) + .as_bytes(), + ) + .unwrap(); + w.finish().unwrap(); + dp.save(arcmgr.as_deref()).unwrap(); + let mut ssts = dir.list_ssts().unwrap(); + ssts.iter_mut().for_each(|meta| { + let start = dir + .load_start_key_by_meta::(meta, arcmgr.clone()) + .unwrap() + .unwrap(); + meta.mut_range().set_start(start) + }); + assert_eq!(ssts, vec![meta]); + } + + #[test] + #[cfg(feature = "test-engines-rocksdb")] + fn test_path_with_range() { + test_path_with_range_and_km(None) + } + + #[test] + #[cfg(feature = "test-engines-rocksdb")] + fn test_path_with_range_encrypted() { + use tempfile::TempDir; + use test_util::new_test_key_manager; + let dir = TempDir::new().unwrap(); + let enc = new_test_key_manager(&dir, None, None, None) + .unwrap() + .unwrap(); + test_path_with_range_and_km(Some(enc)); + } } diff --git a/components/sst_importer/src/sst_importer.rs b/components/sst_importer/src/sst_importer.rs index 907874c6928..90226668e5f 100644 --- a/components/sst_importer/src/sst_importer.rs +++ b/components/sst_importer/src/sst_importer.rs @@ -199,8 +199,10 @@ impl SstImporter { "size" => ?memory_limit, ); + let dir = 
ImportDir::new(root)?; + Ok(SstImporter { - dir: ImportDir::new(root)?, + dir, key_manager, switcher, api_version, @@ -1275,10 +1277,23 @@ impl SstImporter { } } + /// List the basic information of the current SST files. + /// The information contains UUID, region ID, region Epoch. + /// Other fields may be left blank. pub fn list_ssts(&self) -> Result> { self.dir.list_ssts() } + /// Load the start key by a metadata. + /// This will open the internal SST and try to load the first user key. + /// (For RocksEngine, that is the key without the 'z' prefix.) + /// When the SST is empty or the first key cannot be parsed as user key, + /// return None. + pub fn load_start_key_by_meta(&self, meta: &SstMeta) -> Result>> { + self.dir + .load_start_key_by_meta::(meta, self.key_manager.clone()) + } + pub fn new_txn_writer(&self, db: &E, meta: SstMeta) -> Result> { let mut default_meta = meta.clone(); default_meta.set_cf_name(CF_DEFAULT.to_owned()); diff --git a/components/test_raftstore-v2/src/cluster.rs b/components/test_raftstore-v2/src/cluster.rs index eafa7a45403..4c07054210d 100644 --- a/components/test_raftstore-v2/src/cluster.rs +++ b/components/test_raftstore-v2/src/cluster.rs @@ -17,15 +17,17 @@ use engine_traits::{ TabletRegistry, CF_DEFAULT, }; use file_system::IoRateLimiter; -use futures::{compat::Future01CompatExt, executor::block_on, select, Future, FutureExt}; +use futures::{ + compat::Future01CompatExt, executor::block_on, future::BoxFuture, select, Future, FutureExt, +}; use keys::{data_key, validate_data_key, DATA_PREFIX_KEY}; use kvproto::{ errorpb::Error as PbError, kvrpcpb::ApiVersion, metapb::{self, Buckets, PeerRole, RegionEpoch}, raft_cmdpb::{ - AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, RegionDetailResponse, Request, - Response, StatusCmdType, + AdminCmdType, AdminRequest, CmdType, RaftCmdRequest, RaftCmdResponse, RegionDetailResponse, + Request, Response, StatusCmdType, }, raft_serverpb::{ PeerState, RaftApplyState, RaftLocalState, 
RaftMessage, RaftTruncatedState, @@ -48,15 +50,17 @@ use resource_control::ResourceGroupManager; use tempfile::TempDir; use test_pd_client::TestPdClient; use test_raftstore::{ - is_error_response, new_admin_request, new_delete_cmd, new_delete_range_cmd, new_get_cf_cmd, - new_peer, new_prepare_merge, new_put_cf_cmd, new_region_detail_cmd, new_region_leader_cmd, - new_request, new_snap_cmd, new_status_request, new_store, new_tikv_config_with_api_ver, - new_transfer_leader_cmd, sleep_ms, Config, Filter, FilterFactory, PartitionFilterFactory, - RawEngine, + check_raft_cmd_request, is_error_response, new_admin_request, new_delete_cmd, + new_delete_range_cmd, new_get_cf_cmd, new_peer, new_prepare_merge, new_put_cf_cmd, + new_region_detail_cmd, new_region_leader_cmd, new_request, new_status_request, new_store, + new_tikv_config_with_api_ver, new_transfer_leader_cmd, sleep_ms, Config, Filter, FilterFactory, + PartitionFilterFactory, RawEngine, }; -use tikv::server::Result as ServerResult; +use tikv::{server::Result as ServerResult, storage::config::EngineType}; use tikv_util::{ - box_err, box_try, debug, error, safe_panic, + box_err, box_try, debug, error, + future::block_on_timeout, + safe_panic, thread_group::GroupProperties, time::{Instant, ThreadReadId}, timer::GLOBAL_TIMER_HANDLE, @@ -64,6 +68,7 @@ use tikv_util::{ worker::LazyWorker, HandyRwLock, }; +use txn_types::WriteBatchFlags; // We simulate 3 or 5 nodes, each has a store. // Sometimes, we use fixed id to test, which means the id @@ -101,12 +106,13 @@ pub trait Simulator { fn send_raft_msg(&mut self, msg: RaftMessage) -> Result<()>; fn read(&mut self, request: RaftCmdRequest, timeout: Duration) -> Result { + let node_id = request.get_header().get_peer().get_store_id(); let timeout_f = GLOBAL_TIMER_HANDLE .delay(std::time::Instant::now() + timeout) .compat(); futures::executor::block_on(async move { futures::select! 
{ - res = self.async_read(request).fuse() => res, + res = self.async_read(node_id, request).fuse() => res, e = timeout_f.fuse() => { Err(Error::Timeout(format!("request timeout for {:?}: {:?}", timeout,e))) }, @@ -116,12 +122,13 @@ pub trait Simulator { fn async_read( &mut self, + node_id: u64, request: RaftCmdRequest, - ) -> impl Future> + Send { + ) -> impl Future> + Send + 'static { let mut req_clone = request.clone(); - req_clone.clear_requests(); - req_clone.mut_requests().push(new_snap_cmd()); - let snap = self.async_snapshot(req_clone); + // raftstore v2 only supports snap request. + req_clone.mut_requests()[0].set_cmd_type(CmdType::Snap); + let snap = self.async_snapshot(node_id, req_clone); async move { match snap.await { Ok(snap) => { @@ -174,15 +181,21 @@ pub trait Simulator { Ok(response) } - Err(e) => Ok(e), + Err(e) => { + error!("cluster.async_read fails"; "error" => ?e); + Ok(e) + } } } } fn async_snapshot( &mut self, + node_id: u64, request: RaftCmdRequest, - ) -> impl Future, RaftCmdResponse>> + Send; + ) -> impl Future, RaftCmdResponse>> + + Send + + 'static; fn async_peer_msg_on_node(&self, node_id: u64, region_id: u64, msg: PeerMsg) -> Result<()>; @@ -198,7 +211,7 @@ pub trait Simulator { timeout: Duration, ) -> Result { let region_id = request.get_header().get_region_id(); - let (msg, sub) = PeerMsg::raft_query(request); + let (msg, sub) = PeerMsg::raft_query(request.clone()); match self.async_peer_msg_on_node(node_id, region_id, msg) { Ok(()) => {} Err(e) => { @@ -208,17 +221,17 @@ pub trait Simulator { } } - let timeout_f = GLOBAL_TIMER_HANDLE.delay(std::time::Instant::now() + timeout); - // todo: unwrap? - match block_on(async move { - select! 
{ - res = sub.result().fuse() => Ok(res.unwrap()), - _ = timeout_f.compat().fuse() => Err(Error::Timeout(format!("request timeout for {:?}", timeout))), - + let mut fut = Box::pin(sub.result()); + match block_on_timeout(fut.as_mut(), timeout) + .map_err(|e| Error::Timeout(format!("request timeout for {:?}: {:?}", timeout, e)))? + { + Some(QueryResult::Read(_)) => unreachable!(), + Some(QueryResult::Response(resp)) => Ok(resp), + None => { + error!("call_query_on_node receives none response"; "request" => ?request); + // Do not unwrap here, sometimes raftstore v2 may return none. + return Err(box_err!("receives none response {:?}", request)); } - }).unwrap() { - QueryResult::Read(_) => unreachable!(), - QueryResult::Response(resp) => Ok(resp), } } @@ -251,7 +264,13 @@ pub trait Simulator { write_encoder.delete(delete.get_cf(), delete.get_key()); } CmdType::DeleteRange => { - unimplemented!() + let delete_range = req.get_delete_range(); + write_encoder.delete_range( + delete_range.get_cf(), + delete_range.get_start_key(), + delete_range.get_end_key(), + delete_range.get_notify_only(), + ); } _ => unreachable!(), } @@ -278,10 +297,20 @@ pub trait Simulator { }) } - fn async_command_on_node(&self, node_id: u64, mut request: RaftCmdRequest) { + fn async_command_on_node( + &mut self, + node_id: u64, + mut request: RaftCmdRequest, + ) -> BoxFuture<'static, RaftCmdResponse> { let region_id = request.get_header().get_region_id(); - let (msg, _sub) = if request.has_admin_request() { + let is_read = check_raft_cmd_request(&request); + if is_read { + let fut = self.async_read(node_id, request); + return Box::pin(async move { fut.await.unwrap() }); + } + + let (msg, sub) = if request.has_admin_request() { PeerMsg::admin_command(request) } else { let requests = request.get_requests(); @@ -307,6 +336,7 @@ pub trait Simulator { self.async_peer_msg_on_node(node_id, region_id, msg) .unwrap(); + Box::pin(async move { sub.result().await.unwrap() }) } } @@ -371,9 +401,11 @@ impl, EK: 
KvEngine> Cluster { ), >, ) -> Cluster { + let mut tikv_cfg = new_tikv_config_with_api_ver(id, api_version); + tikv_cfg.storage.engine = EngineType::RaftKv2; Cluster { cfg: Config { - tikv: new_tikv_config_with_api_ver(id, api_version), + tikv: tikv_cfg, prefer_mem: true, }, count, @@ -686,7 +718,7 @@ impl, EK: KvEngine> Cluster { // mixed read and write requests are not supportted pub fn call_command( - &mut self, + &self, request: RaftCmdRequest, timeout: Duration, ) -> Result { @@ -839,7 +871,7 @@ impl, EK: KvEngine> Cluster { } pub fn query_leader( - &mut self, + &self, store_id: u64, region_id: u64, timeout: Duration, @@ -1480,8 +1512,9 @@ impl, EK: KvEngine> Cluster { let mut req = self.new_prepare_merge(source, target); let leader = self.leader_of_region(source).unwrap(); req.mut_header().set_peer(leader.clone()); - self.sim - .rl() + let _ = self + .sim + .wl() .async_command_on_node(leader.get_store_id(), req); } @@ -1641,6 +1674,48 @@ impl, EK: KvEngine> Cluster { debug!("all nodes are shut down."); } + + pub fn must_send_flashback_msg( + &mut self, + region_id: u64, + cmd_type: AdminCmdType, + ) -> BoxFuture<'static, RaftCmdResponse> { + let leader = self.leader_of_region(region_id).unwrap(); + let store_id = leader.get_store_id(); + let region_epoch = self.get_region_epoch(region_id); + let mut admin = AdminRequest::default(); + admin.set_cmd_type(cmd_type); + let mut req = RaftCmdRequest::default(); + req.mut_header().set_region_id(region_id); + req.mut_header().set_region_epoch(region_epoch); + req.mut_header().set_peer(leader); + req.set_admin_request(admin); + req.mut_header() + .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let (msg, sub) = PeerMsg::admin_command(req); + let router = self.sim.rl().get_router(store_id).unwrap(); + if let Err(e) = router.send(region_id, msg) { + panic!( + "router send flashback msg {:?} failed, error: {}", + cmd_type, e + ); + } + Box::pin(async move { sub.result().await.unwrap() }) + } + + pub fn 
must_send_wait_flashback_msg(&mut self, region_id: u64, cmd_type: AdminCmdType) { + let resp = self.must_send_flashback_msg(region_id, cmd_type); + block_on(async { + let resp = resp.await; + if resp.get_header().has_error() { + panic!( + "call flashback msg {:?} failed, error: {:?}", + cmd_type, + resp.get_header().get_error() + ); + } + }); + } } pub fn bootstrap_store( diff --git a/components/test_raftstore-v2/src/node.rs b/components/test_raftstore-v2/src/node.rs index ffa38b51796..1674d860ccc 100644 --- a/components/test_raftstore-v2/src/node.rs +++ b/components/test_raftstore-v2/src/node.rs @@ -164,6 +164,10 @@ impl NodeCluster { snap_mgrs: HashMap::default(), } } + + pub fn get_concurrency_manager(&self, node_id: u64) -> ConcurrencyManager { + self.concurrency_managers.get(&node_id).unwrap().clone() + } } impl Simulator for NodeCluster { @@ -200,6 +204,7 @@ impl Simulator for NodeCluster { let simulate_trans = SimulateTransport::new(self.trans.clone()); let mut raft_store = cfg.raft_store.clone(); + raft_store.optimize_for(true); raft_store .validate( cfg.coprocessor.region_split_size(), @@ -319,6 +324,7 @@ impl Simulator for NodeCluster { let enable_region_bucket = cfg.coprocessor.enable_region_bucket(); let region_bucket_size = cfg.coprocessor.region_bucket_size; let mut raftstore_cfg = cfg.tikv.raft_store; + raftstore_cfg.optimize_for(true); raftstore_cfg .validate(region_split_size, enable_region_bucket, region_bucket_size) .unwrap(); @@ -355,10 +361,11 @@ impl Simulator for NodeCluster { fn async_snapshot( &mut self, + node_id: u64, request: RaftCmdRequest, - ) -> impl Future, RaftCmdResponse>> + Send - { - let node_id = request.get_header().get_peer().get_store_id(); + ) -> impl Future, RaftCmdResponse>> + + Send + + 'static { if !self .trans .core diff --git a/components/test_raftstore-v2/src/server.rs b/components/test_raftstore-v2/src/server.rs index 86df27c0e52..30da5a4fc8f 100644 --- a/components/test_raftstore-v2/src/server.rs +++ 
b/components/test_raftstore-v2/src/server.rs @@ -15,12 +15,12 @@ use encryption_export::DataKeyManager; use engine_rocks::RocksEngine; use engine_test::raft::RaftTestEngine; use engine_traits::{KvEngine, RaftEngine, TabletRegistry}; -use futures::{executor::block_on, Future}; +use futures::{executor::block_on, future::BoxFuture, Future}; use grpcio::{ChannelBuilder, EnvBuilder, Environment, Error as GrpcError, Service}; use grpcio_health::HealthService; use kvproto::{ deadlock_grpc::create_deadlock, - debugpb_grpc::DebugClient, + debugpb_grpc::{create_debug, DebugClient}, diagnosticspb_grpc::create_diagnostics, import_sstpb_grpc::create_import_sst, kvrpcpb::{ApiVersion, Context}, @@ -48,14 +48,20 @@ use tempfile::TempDir; use test_pd_client::TestPdClient; use test_raftstore::{filter_send, AddressMap, Config, Filter}; use tikv::{ + config::ConfigController, coprocessor, coprocessor_v2, import::{ImportSstService, SstImporter}, read_pool::ReadPool, server::{ - gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, - raftkv::ReplicaReadLockChecker, resolve, service::DiagnosticsService, ConnectionBuilder, - Error, Extension, NodeV2, PdStoreAddrResolver, RaftClient, RaftKv2, Result as ServerResult, - Server, ServerTransport, + debug2::DebuggerImplV2, + gc_worker::GcWorker, + load_statistics::ThreadLoadPool, + lock_manager::LockManager, + raftkv::ReplicaReadLockChecker, + resolve, + service::{DebugService, DiagnosticsService}, + ConnectionBuilder, Error, Extension, NodeV2, PdStoreAddrResolver, RaftClient, RaftKv2, + Result as ServerResult, Server, ServerTransport, }, storage::{ self, @@ -73,7 +79,7 @@ use tikv_util::{ worker::{Builder as WorkerBuilder, LazyWorker}, Either, HandyRwLock, }; -use tokio::runtime::Builder as TokioBuilder; +use tokio::runtime::{Builder as TokioBuilder, Handle}; use txn_types::TxnExtraScheduler; use crate::{Cluster, RaftStoreRouter, SimulateTransport, Simulator, SnapshotRouter}; @@ -159,6 +165,18 @@ impl Engine for 
TestRaftKv2 { fn schedule_txn_extra(&self, txn_extra: txn_types::TxnExtra) { self.raftkv.schedule_txn_extra(txn_extra) } + + fn start_flashback( + &self, + ctx: &Context, + start_ts: u64, + ) -> BoxFuture<'static, storage::kv::Result<()>> { + self.raftkv.start_flashback(ctx, start_ts) + } + + fn end_flashback(&self, ctx: &Context) -> BoxFuture<'static, storage::kv::Result<()>> { + self.raftkv.end_flashback(ctx) + } } #[derive(Clone)] @@ -251,6 +269,7 @@ pub struct ServerMeta { } type PendingServices = Vec Service>>; +type PendingDebugService = Box, Handle) -> Service>; pub struct ServerCluster { metas: HashMap>, @@ -260,10 +279,14 @@ snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - raft_client: RaftClient, + raft_clients: HashMap>, + conn_builder: ConnectionBuilder, concurrency_managers: HashMap, env: Arc, pub pending_services: HashMap, + // This is used to work around the fact that server cluster is generic over KvEngine while the debug + service implementation is specific over RocksDB. + pub pending_debug_service: Option>, pub health_services: HashMap, pub security_mgr: Arc, pub txn_extra_schedulers: HashMap>, @@ -292,7 +315,6 @@ impl ServerCluster { worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); - let raft_client = RaftClient::new(conn_builder); ServerCluster { metas: HashMap::default(), addrs: map, @@ -303,8 +325,10 @@ snap_mgrs: HashMap::default(), snap_paths: HashMap::default(), pending_services: HashMap::default(), + pending_debug_service: None::>, health_services: HashMap::default(), - raft_client, + raft_clients: HashMap::default(), + conn_builder, concurrency_managers: HashMap::default(), env, txn_extra_schedulers: HashMap::default(), @@ -351,6 +375,7 @@ impl ServerCluster { // Create node. 
let mut raft_store = cfg.raft_store.clone(); + raft_store.optimize_for(true); raft_store .validate( cfg.coprocessor.region_split_size(), @@ -555,7 +580,7 @@ impl ServerCluster { ); let debug_thread_handle = debug_thread_pool.handle().clone(); let diag_service = DiagnosticsService::new( - debug_thread_handle, + debug_thread_handle.clone(), cfg.log.file.filename.clone(), cfg.slow_log_file.clone(), ); @@ -589,6 +614,9 @@ impl ServerCluster { svr.register_service(fact()); } } + if let Some(debug_service) = &self.pending_debug_service { + svr.register_service(debug_service(self, debug_thread_handle.clone())); + } match svr.build_and_bind() { Ok(_) => { server = Some(svr); @@ -675,6 +703,8 @@ impl ServerCluster { self.concurrency_managers .insert(node_id, concurrency_manager); + let client = RaftClient::new(node_id, self.conn_builder.clone()); + self.raft_clients.insert(node_id, client); Ok(node_id) } @@ -788,14 +818,16 @@ impl Simulator for ServerCluster { (meta.rsmeter_cleanup)(); } self.storages.remove(&node_id); + let _ = self.raft_clients.remove(&node_id); } fn async_snapshot( &mut self, + node_id: u64, request: kvproto::raft_cmdpb::RaftCmdRequest, - ) -> impl Future, RaftCmdResponse>> + Send - { - let node_id = request.get_header().get_peer().get_store_id(); + ) -> impl Future, RaftCmdResponse>> + + Send + + 'static { let mut router = match self.metas.get(&node_id) { None => { let mut resp = RaftCmdResponse::default(); @@ -825,8 +857,12 @@ impl Simulator for ServerCluster { } fn send_raft_msg(&mut self, msg: RaftMessage) -> raftstore::Result<()> { - self.raft_client.send(msg).unwrap(); - self.raft_client.flush(); + let from_store = msg.get_from_peer().store_id; + assert_ne!(from_store, 0); + if let Some(client) = self.raft_clients.get_mut(&from_store) { + client.send(msg).unwrap(); + client.flush(); + } Ok(()) } @@ -1020,7 +1056,30 @@ pub fn must_new_cluster_and_debug_client() -> ( DebugClient, u64, ) { - let (cluster, leader, _) = must_new_cluster_mul(1); + let 
mut cluster = new_server_cluster(0, 1); + cluster.create_engines(); + let region_id = cluster.bootstrap_conf_change(); + + { + let mut sim = cluster.sim.wl(); + let tablet_registry = cluster.tablet_registries.get(&1).unwrap().clone(); + let raft_engine = cluster.raft_engines.get(&1).unwrap().clone(); + let debugger = + DebuggerImplV2::new(tablet_registry, raft_engine, ConfigController::default()); + + sim.pending_debug_service = Some(Box::new(move |cluster, debug_thread_handle| { + let raft_extension = cluster.storages.get(&1).unwrap().raft_extension(); + + create_debug(DebugService::new( + debugger.clone(), + debug_thread_handle, + raft_extension, + )) + })); + } + + cluster.start().unwrap(); + let leader = cluster.leader_of_region(region_id).unwrap(); let env = Arc::new(Environment::new(1)); let channel = @@ -1029,3 +1088,46 @@ pub fn must_new_cluster_and_debug_client() -> ( (cluster, client, leader.get_store_id()) } + +pub fn setup_cluster() -> ( + Cluster, RocksEngine>, + TikvClient, + String, + Context, +) { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + + let region_id = 1; + let leader = cluster.leader_of_region(region_id).unwrap(); + let leader_addr = cluster.sim.rl().get_addr(leader.get_store_id()); + let region = cluster.get_region(b"k1"); + let follower = region + .get_peers() + .iter() + .find(|p| **p != leader) + .unwrap() + .clone(); + let follower_addr = cluster.sim.rl().get_addr(follower.get_store_id()); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + + let env = Arc::new(Environment::new(1)); + let channel = ChannelBuilder::new(env).connect(&follower_addr); + let client = TikvClient::new(channel); + + // Verify not setting forwarding header will result in store not match. 
+ let mut put_req = kvproto::kvrpcpb::RawPutRequest::default(); + put_req.set_context(ctx.clone()); + let put_resp = client.raw_put(&put_req).unwrap(); + assert!( + put_resp.get_region_error().has_store_not_match(), + "{:?}", + put_resp + ); + assert!(put_resp.error.is_empty(), "{:?}", put_resp); + (cluster, client, leader_addr, ctx) +} diff --git a/components/test_raftstore-v2/src/util.rs b/components/test_raftstore-v2/src/util.rs index b9e6464c5d8..5642c7279b1 100644 --- a/components/test_raftstore-v2/src/util.rs +++ b/components/test_raftstore-v2/src/util.rs @@ -5,12 +5,16 @@ use std::{fmt::Write, sync::Arc, thread, time::Duration}; use encryption_export::{data_key_manager_from_config, DataKeyManager}; use engine_rocks::{RocksEngine, RocksStatistics}; use engine_test::raft::RaftTestEngine; -use engine_traits::{KvEngine, TabletRegistry, CF_DEFAULT}; +use engine_traits::{CfName, KvEngine, TabletRegistry, CF_DEFAULT}; use file_system::IoRateLimiter; -use futures::Future; -use kvproto::{kvrpcpb::Context, metapb, raft_cmdpb::RaftCmdResponse}; +use futures::future::BoxFuture; +use kvproto::{ + kvrpcpb::Context, + metapb, + raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}, +}; use raftstore::Result; -use rand::RngCore; +use rand::{prelude::SliceRandom, RngCore}; use server::common::ConfiguredRaftEngine; use tempfile::TempDir; use test_raftstore::{new_get_cmd, new_put_cf_cmd, new_request, Config}; @@ -19,10 +23,11 @@ use tikv::{ storage::{ config::EngineType, kv::{SnapContext, SnapshotExt}, - Engine, Snapshot, + point_key_range, Engine, Snapshot, }, }; use tikv_util::{config::ReadableDuration, worker::LazyWorker, HandyRwLock}; +use txn_types::Key; use crate::{bootstrap_store, cluster::Cluster, ServerCluster, Simulator}; @@ -222,7 +227,7 @@ pub fn async_read_on_peer, EK: KvEngine>( key: &[u8], read_quorum: bool, replica_read: bool, -) -> impl Future> { +) -> BoxFuture<'static, RaftCmdResponse> { let mut request = new_request( region.get_id(), 
region.get_region_epoch().clone(), @@ -231,5 +236,74 @@ pub fn async_read_on_peer, EK: KvEngine>( ); request.mut_header().set_peer(peer); request.mut_header().set_replica_read(replica_read); - cluster.sim.wl().async_read(request) + let node_id = request.get_header().get_peer().get_store_id(); + let f = cluster.sim.wl().async_read(node_id, request); + Box::pin(async move { f.await.unwrap() }) +} + +pub fn async_read_index_on_peer, EK: KvEngine>( + cluster: &mut Cluster, + peer: metapb::Peer, + region: metapb::Region, + key: &[u8], + read_quorum: bool, +) -> BoxFuture<'static, RaftCmdResponse> { + let mut cmd = new_get_cmd(key); + cmd.mut_read_index().set_start_ts(u64::MAX); + cmd.mut_read_index() + .mut_key_ranges() + .push(point_key_range(Key::from_raw(key))); + let mut request = new_request( + region.get_id(), + region.get_region_epoch().clone(), + vec![cmd], + read_quorum, + ); + // Use replica read to issue a read index. + request.mut_header().set_replica_read(true); + request.mut_header().set_peer(peer); + let node_id = request.get_header().get_peer().get_store_id(); + let f = cluster.sim.wl().async_read(node_id, request); + Box::pin(async move { f.await.unwrap() }) +} + +pub fn async_command_on_node, EK: KvEngine>( + cluster: &mut Cluster, + node_id: u64, + request: RaftCmdRequest, +) -> BoxFuture<'static, RaftCmdResponse> { + cluster.sim.wl().async_command_on_node(node_id, request) +} + +pub fn test_delete_range, EK: KvEngine>(cluster: &mut Cluster, cf: CfName) { + let data_set: Vec<_> = (1..500) + .map(|i| { + ( + format!("key{:08}", i).into_bytes(), + format!("value{}", i).into_bytes(), + ) + }) + .collect(); + for kvs in data_set.chunks(50) { + let requests = kvs.iter().map(|(k, v)| new_put_cf_cmd(cf, k, v)).collect(); + // key9 is always the last region. + cluster.batch_put(b"key9", requests).unwrap(); + } + + // delete_range request with notify_only set should not actually delete data. 
+ cluster.must_notify_delete_range_cf(cf, b"", b""); + + let mut rng = rand::thread_rng(); + for _ in 0..50 { + let (k, v) = data_set.choose(&mut rng).unwrap(); + assert_eq!(cluster.get_cf(cf, k).unwrap(), *v); + } + + // Empty keys means the whole range. + cluster.must_delete_range_cf(cf, b"", b""); + + for _ in 0..50 { + let k = &data_set.choose(&mut rng).unwrap().0; + assert!(cluster.get_cf(cf, k).is_none()); + } } diff --git a/components/test_raftstore/src/cluster.rs b/components/test_raftstore/src/cluster.rs index 988625d3750..c916ec7448e 100644 --- a/components/test_raftstore/src/cluster.rs +++ b/components/test_raftstore/src/cluster.rs @@ -19,7 +19,7 @@ use engine_traits::{ WriteBatch, WriteBatchExt, CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; -use futures::{self, channel::oneshot, executor::block_on}; +use futures::{self, channel::oneshot, executor::block_on, future::BoxFuture}; use kvproto::{ errorpb::Error as PbError, kvrpcpb::{ApiVersion, Context, DiskFullOpt}, @@ -51,6 +51,7 @@ use tempfile::TempDir; use test_pd_client::TestPdClient; use tikv::server::Result as ServerResult; use tikv_util::{ + mpsc::future, thread_group::GroupProperties, time::{Instant, ThreadReadId}, worker::LazyWorker, @@ -121,7 +122,7 @@ pub trait Simulator { timeout: Duration, ) -> Result { let node_id = request.get_header().get_peer().get_store_id(); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); self.async_read(node_id, batch_id, request, cb); rx.recv_timeout(timeout) .map_err(|_| Error::Timeout(format!("request timeout for {:?}", timeout))) @@ -141,7 +142,7 @@ pub trait Simulator { request: RaftCmdRequest, timeout: Duration, ) -> Result { - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); match self.async_command_on_node(node_id, request, cb) { Ok(()) => {} @@ -968,7 +969,7 @@ impl Cluster { pub fn async_request( &mut self, req: RaftCmdRequest, - ) -> Result> { + ) -> Result> { self.async_request_with_opts(req, 
Default::default()) } @@ -976,7 +977,7 @@ impl Cluster { &mut self, mut req: RaftCmdRequest, opts: RaftCmdExtraOpts, - ) -> Result> { + ) -> Result> { let region_id = req.get_header().get_region_id(); let leader = self.leader_of_region(region_id).unwrap(); req.mut_header().set_peer(leader.clone()); @@ -987,7 +988,10 @@ impl Cluster { Ok(rx) } - pub fn async_exit_joint(&mut self, region_id: u64) -> Result> { + pub fn async_exit_joint( + &mut self, + region_id: u64, + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); @@ -1003,7 +1007,7 @@ impl Cluster { &mut self, key: &[u8], value: &[u8], - ) -> Result> { + ) -> Result> { let mut region = self.get_region(key); let reqs = vec![new_put_cmd(key, value)]; let put = new_request(region.get_id(), region.take_region_epoch(), reqs, false); @@ -1014,7 +1018,7 @@ impl Cluster { &mut self, region_id: u64, peer: metapb::Peer, - ) -> Result> { + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); @@ -1027,7 +1031,7 @@ impl Cluster { &mut self, region_id: u64, peer: metapb::Peer, - ) -> Result> { + ) -> Result> { let region = block_on(self.pd_client.get_region_by_id(region_id)) .unwrap() .unwrap(); @@ -1484,8 +1488,7 @@ impl Cluster { &mut self, region_id: u64, cmd_type: AdminCmdType, - cb: Callback, - ) { + ) -> BoxFuture<'static, RaftCmdResponse> { let leader = self.leader_of_region(region_id).unwrap(); let store_id = leader.get_store_id(); let region_epoch = self.get_region_epoch(region_id); @@ -1498,10 +1501,13 @@ impl Cluster { req.set_admin_request(admin); req.mut_header() .set_flags(WriteBatchFlags::FLASHBACK.bits()); + let (result_tx, result_rx) = oneshot::channel(); let router = self.sim.rl().get_router(store_id).unwrap(); if let Err(e) = router.send_command( req, - cb, + Callback::write(Box::new(move |resp| { + result_tx.send(resp.response).unwrap(); + })), RaftCmdExtraOpts { deadline: None, disk_full_opt: 
DiskFullOpt::AllowedOnAlmostFull, @@ -1512,27 +1518,22 @@ impl Cluster { cmd_type, e ); } + Box::pin(async move { result_rx.await.unwrap() }) } pub fn must_send_wait_flashback_msg(&mut self, region_id: u64, cmd_type: AdminCmdType) { self.wait_applied_to_current_term(region_id, Duration::from_secs(3)); - let (result_tx, result_rx) = oneshot::channel(); - self.must_send_flashback_msg( - region_id, - cmd_type, - Callback::write(Box::new(move |resp| { - if resp.response.get_header().has_error() { - result_tx - .send(Some(resp.response.get_header().get_error().clone())) - .unwrap(); - return; - } - result_tx.send(None).unwrap(); - })), - ); - if let Some(e) = block_on(result_rx).unwrap() { - panic!("call flashback msg {:?} failed, error: {:?}", cmd_type, e); - } + let resp = self.must_send_flashback_msg(region_id, cmd_type); + block_on(async { + let resp = resp.await; + if resp.get_header().has_error() { + panic!( + "call flashback msg {:?} failed, error: {:?}", + cmd_type, + resp.get_header().get_error() + ); + } + }); } pub fn wait_applied_to_current_term(&mut self, region_id: u64, timeout: Duration) { diff --git a/components/test_raftstore/src/node.rs b/components/test_raftstore/src/node.rs index 75ab0064a17..3f0168fa361 100644 --- a/components/test_raftstore/src/node.rs +++ b/components/test_raftstore/src/node.rs @@ -237,6 +237,7 @@ impl Simulator for NodeCluster { let simulate_trans = SimulateTransport::new(self.trans.clone()); let mut raft_store = cfg.raft_store.clone(); + raft_store.optimize_for(false); raft_store .validate( cfg.coprocessor.region_split_size(), @@ -352,6 +353,7 @@ impl Simulator for NodeCluster { let enable_region_bucket = cfg.coprocessor.enable_region_bucket(); let region_bucket_size = cfg.coprocessor.region_bucket_size; let mut raftstore_cfg = cfg.tikv.raft_store; + raftstore_cfg.optimize_for(false); raftstore_cfg .validate(region_split_size, enable_region_bucket, region_bucket_size) .unwrap(); diff --git 
a/components/test_raftstore/src/server.rs b/components/test_raftstore/src/server.rs index a59dafd4504..1dcf63635a2 100644 --- a/components/test_raftstore/src/server.rs +++ b/components/test_raftstore/src/server.rs @@ -53,6 +53,7 @@ use tikv::{ import::{ImportSstService, SstImporter}, read_pool::ReadPool, server::{ + debug::DebuggerImpl, gc_worker::GcWorker, load_statistics::ThreadLoadPool, lock_manager::LockManager, @@ -154,7 +155,8 @@ pub struct ServerCluster { snap_paths: HashMap, snap_mgrs: HashMap, pd_client: Arc, - raft_client: RaftClient, + raft_clients: HashMap>, + conn_builder: ConnectionBuilder, concurrency_managers: HashMap, env: Arc, pub causal_ts_providers: HashMap>, @@ -182,7 +184,6 @@ impl ServerCluster { worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); - let raft_client = RaftClient::new(conn_builder); ServerCluster { metas: HashMap::default(), addrs: map, @@ -196,7 +197,8 @@ impl ServerCluster { pending_services: HashMap::default(), coprocessor_hooks: HashMap::default(), health_services: HashMap::default(), - raft_client, + raft_clients: HashMap::default(), + conn_builder, concurrency_managers: HashMap::default(), env, txn_extra_schedulers: HashMap::default(), @@ -486,19 +488,15 @@ impl ServerCluster { .build() .unwrap(), ); + + let debugger = DebuggerImpl::new(engines.clone(), ConfigController::default()); let debug_thread_handle = debug_thread_pool.handle().clone(); - let debug_service = DebugService::new( - engines.clone(), - None, - None, - debug_thread_handle, - extension, - ConfigController::default(), - ); + let debug_service = DebugService::new(debugger, debug_thread_handle, extension); let apply_router = system.apply_router(); // Create node. 
let mut raft_store = cfg.raft_store.clone(); + raft_store.optimize_for(false); raft_store .validate( cfg.coprocessor.region_split_size(), @@ -645,6 +643,8 @@ impl ServerCluster { self.concurrency_managers .insert(node_id, concurrency_manager); + let client = RaftClient::new(node_id, self.conn_builder.clone()); + self.raft_clients.insert(node_id, client); Ok(node_id) } } @@ -698,6 +698,7 @@ impl Simulator for ServerCluster { } (meta.rsmeter_cleanup)(); } + let _ = self.raft_clients.remove(&node_id); } fn get_node_ids(&self) -> HashSet { @@ -739,8 +740,12 @@ impl Simulator for ServerCluster { } fn send_raft_msg(&mut self, raft_msg: raft_serverpb::RaftMessage) -> Result<()> { - self.raft_client.send(raft_msg).unwrap(); - self.raft_client.flush(); + let from_store = raft_msg.get_from_peer().store_id; + assert_ne!(from_store, 0); + if let Some(client) = self.raft_clients.get_mut(&from_store) { + client.send(raft_msg).unwrap(); + client.flush(); + } Ok(()) } @@ -920,3 +925,41 @@ pub fn must_new_and_configure_cluster_and_kv_client( (cluster, client, ctx) } + +pub fn setup_cluster() -> (Cluster, TikvClient, String, Context) { + let mut cluster = new_server_cluster(0, 3); + cluster.run(); + + let region_id = 1; + let leader = cluster.leader_of_region(region_id).unwrap(); + let leader_addr = cluster.sim.rl().get_addr(leader.get_store_id()); + let region = cluster.get_region(b"k1"); + let follower = region + .get_peers() + .iter() + .find(|p| **p != leader) + .unwrap() + .clone(); + let follower_addr = cluster.sim.rl().get_addr(follower.get_store_id()); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + + let env = Arc::new(Environment::new(1)); + let channel = ChannelBuilder::new(env).connect(&follower_addr); + let client = TikvClient::new(channel); + + // Verify not setting forwarding header will result in store not match. 
+ let mut put_req = kvproto::kvrpcpb::RawPutRequest::default(); + put_req.set_context(ctx.clone()); + let put_resp = client.raw_put(&put_req).unwrap(); + assert!( + put_resp.get_region_error().has_store_not_match(), + "{:?}", + put_resp + ); + assert!(put_resp.error.is_empty(), "{:?}", put_resp); + (cluster, client, leader_addr, ctx) +} diff --git a/components/test_raftstore/src/util.rs b/components/test_raftstore/src/util.rs index cdfe5c8f475..079e3abf1ef 100644 --- a/components/test_raftstore/src/util.rs +++ b/components/test_raftstore/src/util.rs @@ -16,11 +16,11 @@ use encryption_export::{ use engine_rocks::{config::BlobRunMode, RocksEngine, RocksSnapshot, RocksStatistics}; use engine_test::raft::RaftTestEngine; use engine_traits::{ - CfNamesExt, Engines, Iterable, KvEngine, Peekable, RaftEngineDebug, RaftEngineReadOnly, + CfName, CfNamesExt, Engines, Iterable, KvEngine, Peekable, RaftEngineDebug, RaftEngineReadOnly, CF_DEFAULT, CF_RAFT, }; use file_system::IoRateLimiter; -use futures::executor::block_on; +use futures::{executor::block_on, future::BoxFuture, StreamExt}; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ encryptionpb::EncryptionMethod, @@ -42,7 +42,7 @@ use raftstore::{ store::{fsm::RaftRouter, *}, RaftRouterCompactedEventSender, Result, }; -use rand::RngCore; +use rand::{seq::SliceRandom, RngCore}; use server::common::ConfiguredRaftEngine; use tempfile::TempDir; use test_pd_client::TestPdClient; @@ -55,7 +55,9 @@ use tikv::{ }, }; pub use tikv_util::store::{find_peer, new_learner_peer, new_peer}; -use tikv_util::{config::*, escape, time::ThreadReadId, worker::LazyWorker, HandyRwLock}; +use tikv_util::{ + config::*, escape, mpsc::future, time::ThreadReadId, worker::LazyWorker, HandyRwLock, +}; use txn_types::Key; use crate::{Cluster, Config, RawEngine, ServerCluster, Simulator}; @@ -364,7 +366,7 @@ impl Drop for CallbackLeakDetector { } } -pub fn make_cb(cmd: &RaftCmdRequest) -> (Callback, mpsc::Receiver) { +pub fn 
check_raft_cmd_request(cmd: &RaftCmdRequest) -> bool { let mut is_read = cmd.has_status_request(); let mut is_write = cmd.has_admin_request(); for req in cmd.get_requests() { @@ -377,8 +379,14 @@ pub fn make_cb(cmd: &RaftCmdRequest) -> (Callback, mpsc::Receiver } } assert!(is_read ^ is_write, "Invalid RaftCmdRequest: {:?}", cmd); + is_read +} - let (tx, rx) = mpsc::channel(); +pub fn make_cb( + cmd: &RaftCmdRequest, +) -> (Callback, future::Receiver) { + let is_read = check_raft_cmd_request(cmd); + let (tx, rx) = future::bounded(1, future::WakePolicy::Immediately); let mut detector = CallbackLeakDetector::default(); let cb = if is_read { Callback::read(Box::new(move |resp: ReadResponse| { @@ -400,7 +408,7 @@ pub fn make_cb_ext( cmd: &RaftCmdRequest, proposed: Option, committed: Option, -) -> (Callback, mpsc::Receiver) { +) -> (Callback, future::Receiver) { let (cb, receiver) = make_cb(cmd); if let Callback::Write { cb, .. } = cb { (Callback::write_ext(cb, proposed, committed), receiver) @@ -435,7 +443,7 @@ pub fn async_read_on_peer( key: &[u8], read_quorum: bool, replica_read: bool, -) -> mpsc::Receiver { +) -> BoxFuture<'static, RaftCmdResponse> { let node_id = peer.get_store_id(); let mut request = new_request( region.get_id(), @@ -445,10 +453,13 @@ pub fn async_read_on_peer( ); request.mut_header().set_peer(peer); request.mut_header().set_replica_read(replica_read); - let (tx, rx) = mpsc::sync_channel(1); + let (tx, mut rx) = future::bounded(1, future::WakePolicy::Immediately); let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); cluster.sim.wl().async_read(node_id, None, request, cb); - rx + Box::pin(async move { + let fut = rx.next(); + fut.await.unwrap() + }) } pub fn batch_read_on_peer( @@ -508,7 +519,7 @@ pub fn async_read_index_on_peer( region: metapb::Region, key: &[u8], read_quorum: bool, -) -> mpsc::Receiver { +) -> BoxFuture<'static, RaftCmdResponse> { let node_id = peer.get_store_id(); let mut cmd = new_read_index_cmd(); 
cmd.mut_read_index().set_start_ts(u64::MAX); @@ -522,10 +533,30 @@ pub fn async_read_index_on_peer( read_quorum, ); request.mut_header().set_peer(peer); - let (tx, rx) = mpsc::sync_channel(1); + let (tx, mut rx) = future::bounded(1, future::WakePolicy::Immediately); let cb = Callback::read(Box::new(move |resp| drop(tx.send(resp.response)))); cluster.sim.wl().async_read(node_id, None, request, cb); - rx + Box::pin(async move { + let fut = rx.next(); + fut.await.unwrap() + }) +} + +pub fn async_command_on_node( + cluster: &mut Cluster, + node_id: u64, + request: RaftCmdRequest, +) -> BoxFuture<'static, RaftCmdResponse> { + let (cb, mut rx) = make_cb(&request); + cluster + .sim + .rl() + .async_command_on_node(node_id, request, cb) + .unwrap(); + Box::pin(async move { + let fut = rx.next(); + fut.await.unwrap() + }) } pub fn must_get_value(resp: &RaftCmdResponse) -> Vec { @@ -645,11 +676,11 @@ pub fn configure_for_request_snapshot(cluster: &mut Cluster) { cluster.cfg.raft_store.raft_log_gc_size_limit = Some(ReadableSize::mb(20)); } -pub fn configure_for_hibernate(cluster: &mut Cluster) { +pub fn configure_for_hibernate(config: &mut Config) { // Uses long check interval to make leader keep sleeping during tests. 
- cluster.cfg.raft_store.abnormal_leader_missing_duration = ReadableDuration::secs(20); - cluster.cfg.raft_store.max_leader_missing_duration = ReadableDuration::secs(40); - cluster.cfg.raft_store.peer_stale_state_check_interval = ReadableDuration::secs(10); + config.raft_store.abnormal_leader_missing_duration = ReadableDuration::secs(20); + config.raft_store.max_leader_missing_duration = ReadableDuration::secs(40); + config.raft_store.peer_stale_state_check_interval = ReadableDuration::secs(10); } pub fn configure_for_snapshot(config: &mut Config) { @@ -842,6 +873,32 @@ pub fn must_kv_read_equal(client: &TikvClient, ctx: Context, key: Vec, val: assert_eq!(get_resp.take_value(), val); } +pub fn must_kv_read_not_found(client: &TikvClient, ctx: Context, key: Vec, ts: u64) { + let mut get_req = GetRequest::default(); + get_req.set_context(ctx); + get_req.set_key(key); + get_req.set_version(ts); + + for _ in 1..250 { + let get_resp = client.kv_get(&get_req).unwrap(); + if get_resp.has_region_error() || get_resp.has_error() { + thread::sleep(Duration::from_millis(20)); + } else if get_resp.get_not_found() { + return; + } + } + + // Last try + let get_resp = client.kv_get(&get_req).unwrap(); + assert!( + !get_resp.has_region_error(), + "{:?}", + get_resp.get_region_error() + ); + assert!(!get_resp.has_error(), "{:?}", get_resp.get_error()); + assert!(get_resp.get_not_found()); +} + pub fn write_and_read_key( client: &TikvClient, ctx: &Context, @@ -1430,3 +1487,36 @@ pub fn wait_for_synced(cluster: &mut Cluster, node_id: u64, regio } assert!(snapshot.ext().is_max_ts_synced()); } + +pub fn test_delete_range(cluster: &mut Cluster, cf: CfName) { + let data_set: Vec<_> = (1..500) + .map(|i| { + ( + format!("key{:08}", i).into_bytes(), + format!("value{}", i).into_bytes(), + ) + }) + .collect(); + for kvs in data_set.chunks(50) { + let requests = kvs.iter().map(|(k, v)| new_put_cf_cmd(cf, k, v)).collect(); + // key9 is always the last region. 
+ cluster.batch_put(b"key9", requests).unwrap(); + } + + // delete_range request with notify_only set should not actually delete data. + cluster.must_notify_delete_range_cf(cf, b"", b""); + + let mut rng = rand::thread_rng(); + for _ in 0..50 { + let (k, v) = data_set.choose(&mut rng).unwrap(); + assert_eq!(cluster.get_cf(cf, k).unwrap(), *v); + } + + // Empty keys means the whole range. + cluster.must_delete_range_cf(cf, b"", b""); + + for _ in 0..50 { + let k = &data_set.choose(&mut rng).unwrap().0; + assert!(cluster.get_cf(cf, k).is_none()); + } +} diff --git a/components/tikv_util/src/future.rs b/components/tikv_util/src/future.rs index 7b22bebb482..875a8d97811 100644 --- a/components/tikv_util/src/future.rs +++ b/components/tikv_util/src/future.rs @@ -1,6 +1,7 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. use std::{ + borrow::BorrowMut, cell::UnsafeCell, sync::{ atomic::{AtomicU8, Ordering}, @@ -15,7 +16,7 @@ use futures::{ task::{self, ArcWake, Context, Poll}, }; -use crate::callback::must_call; +use crate::{callback::must_call, timer::GLOBAL_TIMER_HANDLE}; /// Generates a paired future and callback so that when callback is being /// called, its result is automatically passed as a future result. @@ -209,6 +210,28 @@ pub fn try_poll(f: impl Future) -> Option { }) } +// Run a future with a timeout on the current thread. Returns Err if times out. +#[allow(clippy::result_unit_err)] +pub fn block_on_timeout(mut fut: B, dur: std::time::Duration) -> Result +where + F: std::future::Future + Unpin, + B: BorrowMut, +{ + use futures_util::compat::Future01CompatExt; + + let mut timeout = GLOBAL_TIMER_HANDLE + .delay(std::time::Instant::now() + dur) + .compat() + .fuse(); + let mut f = fut.borrow_mut().fuse(); + futures::executor::block_on(async { + futures::select! 
{ + _ = timeout => Err(()), + item = f => Ok(item), + } + }) +} + #[cfg(test)] mod tests { use std::sync::atomic::AtomicUsize; diff --git a/components/tikv_util/src/macros.rs b/components/tikv_util/src/macros.rs index 10889046a3b..75323426f70 100644 --- a/components/tikv_util/src/macros.rs +++ b/components/tikv_util/src/macros.rs @@ -11,7 +11,7 @@ macro_rules! box_err { e.into() }); ($f:tt, $($arg:expr),+) => ({ - box_err!(format!($f, $($arg),+)) + $crate::box_err!(format!($f, $($arg),+)) }); } diff --git a/components/tikv_util/src/mpsc/future.rs b/components/tikv_util/src/mpsc/future.rs index 00598f5295d..4492e33a933 100644 --- a/components/tikv_util/src/mpsc/future.rs +++ b/components/tikv_util/src/mpsc/future.rs @@ -6,14 +6,18 @@ use std::{ pin::Pin, sync::atomic::{self, AtomicUsize, Ordering}, task::{Context, Poll}, + time::Duration, }; +pub use crossbeam::channel::{RecvTimeoutError, TryRecvError}; use crossbeam::{ - channel::{SendError, TryRecvError}, + channel::SendError, queue::{ArrayQueue, SegQueue}, }; use futures::{task::AtomicWaker, Stream, StreamExt}; +use crate::future::block_on_timeout; + enum QueueType { Unbounded(SegQueue), Bounded(ArrayQueue), @@ -176,6 +180,15 @@ impl Receiver { } Err(TryRecvError::Disconnected) } + + pub fn recv_timeout(&mut self, dur: Duration) -> Result { + let fut = self.next(); + match block_on_timeout(fut, dur) { + Ok(Some(v)) => Ok(v), + Ok(None) => Err(RecvTimeoutError::Disconnected), + Err(_) => Err(RecvTimeoutError::Timeout), + } + } } impl Drop for Receiver { diff --git a/components/tracker/src/lib.rs b/components/tracker/src/lib.rs index fafd8415039..0682439bb45 100644 --- a/components/tracker/src/lib.rs +++ b/components/tracker/src/lib.rs @@ -68,7 +68,12 @@ impl Tracker { self.metrics.wf_commit_log_nanos - self.metrics.wf_batch_wait_nanos, ); detail.set_apply_batch_wait_nanos(self.metrics.apply_wait_nanos); - detail.set_apply_log_nanos(self.metrics.apply_time_nanos - self.metrics.apply_wait_nanos); + // When 
async_prewrite_apply is set, the `apply_time_nanos` could be less than + // apply_wait_nanos. + if self.metrics.apply_time_nanos > self.metrics.apply_wait_nanos { + detail + .set_apply_log_nanos(self.metrics.apply_time_nanos - self.metrics.apply_wait_nanos); + } detail.set_apply_mutex_lock_nanos(self.metrics.apply_mutex_lock_nanos); detail.set_apply_write_leader_wait_nanos(self.metrics.apply_thread_wait_nanos); detail.set_apply_write_wal_nanos(self.metrics.apply_write_wal_nanos); diff --git a/components/txn_types/src/lock.rs b/components/txn_types/src/lock.rs index c8e37823bc4..103353318e0 100644 --- a/components/txn_types/src/lock.rs +++ b/components/txn_types/src/lock.rs @@ -409,6 +409,7 @@ impl Lock { key: &Key, ts: TimeStamp, bypass_locks: &TsSet, + is_replica_read: bool, ) -> Result<()> { if lock.ts > ts || lock.lock_type == LockType::Lock @@ -429,6 +430,14 @@ impl Lock { let raw_key = key.to_raw()?; + // Disable replica read for autocommit max ts read, to avoid breaking + // linearizability. See https://github.com/pingcap/tidb/issues/43583 for details. 
+ if ts == TimeStamp::max() && is_replica_read { + return Err(Error::from(ErrorInner::KeyIsLocked( + lock.into_owned().into_lock_info(raw_key), + ))); + } + if ts == TimeStamp::max() && raw_key == lock.primary && !lock.use_async_commit { // When `ts == TimeStamp::max()` (which means to get latest committed version // for primary key), and current key is the primary key, we ignore @@ -478,7 +487,7 @@ impl Lock { iso_level: IsolationLevel, ) -> Result<()> { match iso_level { - IsolationLevel::Si => Lock::check_ts_conflict_si(lock, key, ts, bypass_locks), + IsolationLevel::Si => Lock::check_ts_conflict_si(lock, key, ts, bypass_locks, false), IsolationLevel::RcCheckTs => { Lock::check_ts_conflict_rc_check_ts(lock, key, ts, bypass_locks) } @@ -486,6 +495,22 @@ impl Lock { } } + pub fn check_ts_conflict_for_replica_read( + lock: Cow<'_, Self>, + key: &Key, + ts: TimeStamp, + bypass_locks: &TsSet, + iso_level: IsolationLevel, + ) -> Result<()> { + match iso_level { + IsolationLevel::Si => Lock::check_ts_conflict_si(lock, key, ts, bypass_locks, true), + IsolationLevel::RcCheckTs => { + unreachable!() + } + _ => Ok(()), + } + } + pub fn is_pessimistic_txn(&self) -> bool { !self.for_update_ts.is_zero() } diff --git a/etc/error_code.toml b/etc/error_code.toml index 4fae4d9ea57..839c4f33f32 100644 --- a/etc/error_code.toml +++ b/etc/error_code.toml @@ -753,6 +753,11 @@ error = ''' KV:Storage:LockIfExistsFailed ''' +["KV:Storage:PrimaryMismatch"] +error = ''' +KV:Storage:PrimaryMismatch +''' + ["KV:Storage:Unknown"] error = ''' KV:Storage:Unknown diff --git a/proxy_components/engine_tiflash/src/misc.rs b/proxy_components/engine_tiflash/src/misc.rs index 127df8b697f..ae5bb63efeb 100644 --- a/proxy_components/engine_tiflash/src/misc.rs +++ b/proxy_components/engine_tiflash/src/misc.rs @@ -1,8 +1,9 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. 
+use engine_rocks::get_range_stats; use engine_traits::{ CfNamesExt, DeleteStrategy, ImportExt, IterOptions, Iterable, Iterator, MiscExt, Mutable, - Range, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, + Range, RangeStats, Result, SstWriter, SstWriterBuilder, WriteBatch, WriteBatchExt, ALL_CFS, }; use tikv_util::{box_try, keybuilder::KeyBuilder}; @@ -336,15 +337,8 @@ impl MiscExt for RocksEngine { Ok(total) } - fn get_range_entries_and_versions( - &self, - cf: &str, - start: &[u8], - end: &[u8], - ) -> Result> { - Ok(crate::properties::get_range_entries_and_versions( - self, cf, start, end, - )) + fn get_range_stats(&self, cf: &str, start: &[u8], end: &[u8]) -> Result> { + Ok(get_range_stats(&self.rocks, cf, start, end)) } fn is_stalled_or_stopped(&self) -> bool { diff --git a/proxy_components/engine_tiflash/src/sst.rs b/proxy_components/engine_tiflash/src/sst.rs index 0518dd7feb5..e0d9d818b42 100644 --- a/proxy_components/engine_tiflash/src/sst.rs +++ b/proxy_components/engine_tiflash/src/sst.rs @@ -2,9 +2,10 @@ use std::{path::PathBuf, sync::Arc}; +use engine_rocks::encryption::WrappedEncryptionKeyManager; use engine_traits::{ - Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, SstCompressionType, - SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, + EncryptionKeyManager, Error, ExternalSstFileInfo, IterOptions, Iterator, RefIterable, Result, + SstCompressionType, SstExt, SstMetaInfo, SstReader, SstWriter, SstWriterBuilder, CF_DEFAULT, }; use fail::fail_point; use kvproto::import_sstpb::SstMeta; @@ -13,6 +14,7 @@ use rocksdb::{ EnvOptions, ExternalSstFileInfo as RawExternalSstFileInfo, SequentialFile, SstFileReader, SstFileWriter, DB, }; +use tikv_util::box_err; use crate::{engine::RocksEngine, options::RocksReadOptions, r2e}; @@ -63,6 +65,14 @@ impl SstReader for RocksSstReader { fn open(path: &str) -> Result { Self::open_with_env(path, None) } + fn open_encrypted(path: &str, mgr: Arc) -> 
Result { + let env = Env::new_key_managed_encrypted_env( + Arc::default(), + WrappedEncryptionKeyManager::new(mgr), + ) + .map_err(|err| Error::Other(box_err!("failed to open encrypted env: {}", err)))?; + Self::open_with_env(path, Some(Arc::new(env))) + } fn verify_checksum(&self) -> Result<()> { self.inner.verify_checksum().map_err(r2e)?; Ok(()) diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/cluster.rs b/proxy_components/mock-engine-store/src/mock_cluster/v1/cluster.rs index 602365c786f..c7ddcba340a 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/cluster.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/cluster.rs @@ -118,7 +118,7 @@ pub trait Simulator { timeout: Duration, ) -> Result { let node_id = request.get_header().get_peer().get_store_id(); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); self.async_read(node_id, batch_id, request, cb); rx.recv_timeout(timeout) .map_err(|_| Error::Timeout(format!("request timeout for {:?}", timeout))) @@ -138,7 +138,7 @@ pub trait Simulator { request: RaftCmdRequest, timeout: Duration, ) -> Result { - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); match self.async_command_on_node(node_id, request, cb) { Ok(()) => {} diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/node.rs b/proxy_components/mock-engine-store/src/mock_cluster/v1/node.rs index ff7dd4bd144..79102eb44b7 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/node.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/node.rs @@ -272,6 +272,7 @@ impl Simulator for NodeCluster { let simulate_trans = SimulateTransport::new(self.trans.clone()); let mut raft_store = cfg.raft_store.clone(); + raft_store.optimize_for(false); raft_store .validate( cfg.coprocessor.region_split_size(), @@ -405,6 +406,7 @@ impl Simulator for NodeCluster { ); let mut raftstore_cfg = cfg.tikv.raft_store.clone(); + 
raftstore_cfg.optimize_for(false); raftstore_cfg .validate( cfg.coprocessor.region_split_size(), diff --git a/proxy_components/mock-engine-store/src/mock_cluster/v1/server.rs b/proxy_components/mock-engine-store/src/mock_cluster/v1/server.rs index 71d6257c193..287f57bd22c 100644 --- a/proxy_components/mock-engine-store/src/mock_cluster/v1/server.rs +++ b/proxy_components/mock-engine-store/src/mock_cluster/v1/server.rs @@ -182,7 +182,8 @@ impl ServerCluster { worker.scheduler(), Arc::new(ThreadLoadPool::with_threshold(usize::MAX)), ); - let raft_client = RaftClient::new(conn_builder); + // TODO Set store_id to 0 as a workaround. + let raft_client = RaftClient::new(0, conn_builder); ServerCluster { metas: HashMap::default(), addrs: map, @@ -470,6 +471,7 @@ impl ServerCluster { let apply_router = system.apply_router(); // Create node. let mut raft_store = cfg.raft_store.clone(); + raft_store.optimize_for(false); raft_store .validate( cfg.coprocessor.region_split_size(), @@ -768,7 +770,7 @@ impl Simulator for ServerCluster { timeout: Duration, ) -> Result { let node_id = request.get_header().get_peer().get_store_id(); - let (cb, rx) = test_raftstore::make_cb(&request); + let (cb, mut rx) = test_raftstore::make_cb(&request); self.async_read(node_id, batch_id, request, cb); rx.recv_timeout(timeout) .map_err(|_| RaftError::Timeout(format!("request timeout for {:?}", timeout))) @@ -780,7 +782,7 @@ impl Simulator for ServerCluster { request: RaftCmdRequest, timeout: Duration, ) -> Result { - let (cb, rx) = test_raftstore::make_cb(&request); + let (cb, mut rx) = test_raftstore::make_cb(&request); match self.async_command_on_node(node_id, request, cb) { Ok(()) => {} diff --git a/proxy_components/proxy_server/src/run.rs b/proxy_components/proxy_server/src/run.rs index 84e079787df..46fd97e77fe 100644 --- a/proxy_components/proxy_server/src/run.rs +++ b/proxy_components/proxy_server/src/run.rs @@ -78,6 +78,7 @@ use tikv::{ read_pool::{build_yatp_read_pool, ReadPool, 
ReadPoolConfigManager}, server::{ config::{Config as ServerConfig, ServerConfigManager}, + debug::{Debugger, DebuggerImpl}, gc_worker::GcWorker, raftkv::ReplicaReadLockChecker, resolve, @@ -1033,6 +1034,7 @@ impl TiKvServer { let server_config = Arc::new(VersionTrack::new(self.core.config.server.clone())); + self.core.config.raft_store.optimize_for(false); self.core .config .raft_store @@ -1311,17 +1313,21 @@ impl TiKvServer { } // Debug service. - let debug_service = DebugService::new( + let mut debugger = DebuggerImpl::new( Engines { kv: engines.engines.kv.rocks.clone(), raft: engines.engines.raft.clone(), }, - self.kv_statistics.clone(), - self.raft_statistics.clone(), - servers.server.get_debug_thread_pool().clone(), - engines.engine.raft_extension().clone(), self.cfg_controller.as_ref().unwrap().clone(), ); + debugger.set_kv_statistics(self.kv_statistics.clone()); + debugger.set_raft_statistics(self.raft_statistics.clone()); + + let debug_service = DebugService::new( + debugger, + servers.server.get_debug_thread_pool().clone(), + engines.engine.raft_extension(), + ); if servers .server .register_service(create_debug(debug_service)) diff --git a/proxy_tests/proxy/v2_compat/tablet_snapshot.rs b/proxy_tests/proxy/v2_compat/tablet_snapshot.rs index 653f3214a22..ecc3f907c79 100644 --- a/proxy_tests/proxy/v2_compat/tablet_snapshot.rs +++ b/proxy_tests/proxy/v2_compat/tablet_snapshot.rs @@ -227,7 +227,6 @@ fn test_v1_apply_snap_from_v2() { cluster_v2.run(); let region = cluster_v2.get_region(b""); - let region_id = region.get_id(); cluster_v2.must_split(®ion, b"k0010"); let s1_addr = cluster_v1.get_addr(1); diff --git a/scripts/test b/scripts/test index e4c46c6a620..d98f627dcf1 100755 --- a/scripts/test +++ b/scripts/test @@ -28,6 +28,7 @@ export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}:${LOCAL_DIR}/lib" export LOG_LEVEL=DEBUG export RUST_BACKTRACE=full +echo ${TIKV_ENABLE_FEATURES} cargo $CUSTOM_TEST_COMMAND --workspace \ --exclude fuzz --exclude fuzzer-afl --exclude 
fuzzer-honggfuzz \ --exclude fuzzer-libfuzzer --exclude fuzz-targets \ diff --git a/src/config/mod.rs b/src/config/mod.rs index 9bee879b76f..31d1d15e763 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -3369,6 +3369,10 @@ impl TikvConfig { self.coprocessor .optimize_for(self.storage.engine == EngineType::RaftKv2); self.coprocessor.validate()?; + self.split + .optimize_for(self.coprocessor.region_split_size()); + self.raft_store + .optimize_for(self.storage.engine == EngineType::RaftKv2); self.raft_store.validate( self.coprocessor.region_split_size(), self.coprocessor.enable_region_bucket(), @@ -4293,6 +4297,13 @@ impl ConfigController { pub fn get_current(&self) -> TikvConfig { self.inner.read().unwrap().current.clone() } + + pub fn get_engine_type(&self) -> &'static str { + if self.get_current().storage.engine == EngineType::RaftKv2 { + return "partitioned-raft-kv"; + } + "raft-kv" + } } #[cfg(test)] @@ -4307,9 +4318,16 @@ mod tests { use grpcio::ResourceQuota; use itertools::Itertools; use kvproto::kvrpcpb::CommandPri; - use raftstore::coprocessor::{ - config::{RAFTSTORE_V2_SPLIT_SIZE, SPLIT_SIZE}, - region_info_accessor::MockRegionInfoProvider, + use raftstore::{ + coprocessor::{ + config::{RAFTSTORE_V2_SPLIT_SIZE, SPLIT_SIZE}, + region_info_accessor::MockRegionInfoProvider, + }, + store::{ + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO, DEFAULT_BIG_REGION_BYTE_THRESHOLD, + DEFAULT_BIG_REGION_QPS_THRESHOLD, DEFAULT_BYTE_THRESHOLD, DEFAULT_QPS_THRESHOLD, + REGION_CPU_OVERLOAD_THRESHOLD_RATIO, + }, }; use slog::Level; use tempfile::Builder; @@ -5696,6 +5714,7 @@ mod tests { default_cfg.rocksdb.defaultcf.target_file_size_base = Some(ReadableSize::mb(8)); default_cfg.rocksdb.lockcf.target_file_size_base = Some(ReadableSize::mb(8)); default_cfg.raftdb.defaultcf.target_file_size_base = Some(ReadableSize::mb(8)); + default_cfg.raft_store.region_compact_check_step = Some(100); // Other special cases. 
cfg.pd.retry_max_count = default_cfg.pd.retry_max_count; // Both -1 and isize::MAX are the same. @@ -5738,18 +5757,37 @@ mod tests { #[test] fn test_region_size_config() { let mut default_cfg = TikvConfig::default(); - default_cfg.coprocessor.optimize_for(false); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.storage.engine = EngineType::RaftKv; + default_cfg.validate().unwrap(); assert_eq!(default_cfg.coprocessor.region_split_size(), SPLIT_SIZE); assert!(!default_cfg.coprocessor.enable_region_bucket()); + assert_eq!(default_cfg.split.qps_threshold, DEFAULT_QPS_THRESHOLD); + assert_eq!( + default_cfg.split.region_cpu_overload_threshold_ratio, + REGION_CPU_OVERLOAD_THRESHOLD_RATIO + ); + assert_eq!(default_cfg.split.byte_threshold, DEFAULT_BYTE_THRESHOLD); + let mut default_cfg = TikvConfig::default(); - default_cfg.coprocessor.optimize_for(true); - default_cfg.coprocessor.validate().unwrap(); + default_cfg.storage.engine = EngineType::RaftKv2; + default_cfg.validate().unwrap(); assert_eq!( default_cfg.coprocessor.region_split_size(), RAFTSTORE_V2_SPLIT_SIZE ); + assert_eq!( + default_cfg.split.qps_threshold, + DEFAULT_BIG_REGION_QPS_THRESHOLD + ); + assert_eq!( + default_cfg.split.region_cpu_overload_threshold_ratio, + BIG_REGION_CPU_OVERLOAD_THRESHOLD_RATIO + ); + assert_eq!( + default_cfg.split.byte_threshold, + DEFAULT_BIG_REGION_BYTE_THRESHOLD + ); assert!(default_cfg.coprocessor.enable_region_bucket()); let mut default_cfg = TikvConfig::default(); diff --git a/src/lib.rs b/src/lib.rs index 4da16ee0e74..b3e9ebaf8e8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,6 +28,8 @@ #![feature(let_chains)] #![feature(read_buf)] #![feature(type_alias_impl_trait)] +#![allow(incomplete_features)] +#![feature(return_position_impl_trait_in_trait)] #[macro_use(fail_point)] extern crate fail; diff --git a/src/server/debug.rs b/src/server/debug.rs index c16621f4d85..7ce7c832f48 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -22,6 +22,7 @@ use 
engine_traits::{ }; use kvproto::{ debugpb::{self, Db as DbType}, + kvrpcpb::MvccInfo, metapb::{PeerRole, Region}, raft_serverpb::*, }; @@ -68,7 +69,7 @@ pub struct RegionInfo { } impl RegionInfo { - fn new( + pub fn new( raft_local: Option, raft_apply: Option, region_local: Option, @@ -125,8 +126,56 @@ trait InnerRocksEngineExtractor { fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine>; } +pub trait Debugger { + fn get(&self, db: DbType, cf: &str, key: &[u8]) -> Result>; + + fn raft_log(&self, region_id: u64, log_index: u64) -> Result; + + fn region_info(&self, region_id: u64) -> Result; + + fn region_size>(&self, region_id: u64, cfs: Vec) -> Result>; + + /// Scan MVCC Infos for given range `[start, end)`. + fn scan_mvcc( + &self, + start: &[u8], + end: &[u8], + limit: u64, + ) -> Result, MvccInfo)>> + Send>; + + /// Compact the cf[start..end) in the db. + fn compact( + &self, + db: DbType, + cf: &str, + start: &[u8], + end: &[u8], + threads: u32, + bottommost: BottommostLevelCompaction, + ) -> Result<()>; + + /// Get all regions holding region meta data from raft CF in KV storage. 
+ fn get_all_regions_in_store(&self) -> Result>; + + fn get_store_ident(&self) -> Result; + + fn dump_kv_stats(&self) -> Result; + + fn dump_raft_stats(&self) -> Result; + + fn modify_tikv_config(&self, config_name: &str, config_value: &str) -> Result<()>; + + fn get_region_properties(&self, region_id: u64) -> Result>; + + fn reset_to_version(&self, version: u64); + + fn set_kv_statistics(&mut self, s: Option>); + + fn set_raft_statistics(&mut self, s: Option>); +} + #[derive(Clone)] -pub struct Debugger { +pub struct DebuggerImpl { engines: Engines, kv_statistics: Option>, raft_statistics: Option>, @@ -134,7 +183,7 @@ pub struct Debugger { cfg_controller: ConfigController, } -impl InnerRocksEngineExtractor for Debugger { +impl InnerRocksEngineExtractor for DebuggerImpl { default fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { match db { DbType::Kv => Ok(&self.engines.kv), @@ -144,7 +193,7 @@ impl InnerRocksEngineExtractor for Debugger { } } -impl InnerRocksEngineExtractor for Debugger { +impl InnerRocksEngineExtractor for DebuggerImpl { fn get_db_from_type(&self, db: DbType) -> Result<&RocksEngine> { match db { DbType::Kv => Ok(&self.engines.kv), @@ -154,13 +203,13 @@ impl InnerRocksEngineExtractor for Debugger { } } -impl Debugger { +impl DebuggerImpl { pub fn new( engines: Engines, cfg_controller: ConfigController, - ) -> Debugger { + ) -> DebuggerImpl { let reset_to_version_manager = ResetToVersionManager::new(engines.kv.clone()); - Debugger { + DebuggerImpl { engines, kv_statistics: None, raft_statistics: None, @@ -169,160 +218,10 @@ impl Debugger { } } - pub fn set_kv_statistics(&mut self, s: Option>) { - self.kv_statistics = s; - } - - pub fn set_raft_statistics(&mut self, s: Option>) { - self.raft_statistics = s; - } - pub fn get_engine(&self) -> &Engines { &self.engines } - pub fn dump_kv_stats(&self) -> Result { - let mut kv_str = box_try!(MiscExt::dump_stats(&self.engines.kv)); - if let Some(s) = self.kv_statistics.as_ref() && let 
Some(s) = s.to_string() { - kv_str.push_str(&s); - } - Ok(kv_str) - } - - pub fn dump_raft_stats(&self) -> Result { - let mut raft_str = box_try!(RaftEngine::dump_stats(&self.engines.raft)); - if let Some(s) = self.raft_statistics.as_ref() && let Some(s) = s.to_string() { - raft_str.push_str(&s); - } - Ok(raft_str) - } - - /// Get all regions holding region meta data from raft CF in KV storage. - pub fn get_all_regions_in_store(&self) -> Result> { - let db = &self.engines.kv; - let cf = CF_RAFT; - let start_key = keys::REGION_META_MIN_KEY; - let end_key = keys::REGION_META_MAX_KEY; - let mut regions = Vec::with_capacity(128); - box_try!(db.scan(cf, start_key, end_key, false, |key, _| { - let (id, suffix) = box_try!(keys::decode_region_meta_key(key)); - if suffix != keys::REGION_STATE_SUFFIX { - return Ok(true); - } - regions.push(id); - Ok(true) - })); - regions.sort_unstable(); - Ok(regions) - } - - pub fn get(&self, db: DbType, cf: &str, key: &[u8]) -> Result> { - validate_db_and_cf(db, cf)?; - let db = self.get_db_from_type(db)?; - match db.get_value_cf(cf, key) { - Ok(Some(v)) => Ok(v.to_vec()), - Ok(None) => Err(Error::NotFound(format!( - "value for key {:?} in db {:?}", - key, db - ))), - Err(e) => Err(box_err!(e)), - } - } - - pub fn raft_log(&self, region_id: u64, log_index: u64) -> Result { - if let Some(e) = box_try!(self.engines.raft.get_entry(region_id, log_index)) { - return Ok(e); - } - Err(Error::NotFound(format!( - "raft log for region {} at index {}", - region_id, log_index - ))) - } - - pub fn region_info(&self, region_id: u64) -> Result { - let raft_state = box_try!(self.engines.raft.get_raft_state(region_id)); - - let apply_state_key = keys::apply_state_key(region_id); - let apply_state = box_try!( - self.engines - .kv - .get_msg_cf::(CF_RAFT, &apply_state_key) - ); - - let region_state_key = keys::region_state_key(region_id); - let region_state = box_try!( - self.engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) - ); - - match (raft_state, 
apply_state, region_state) { - (None, None, None) => Err(Error::NotFound(format!("info for region {}", region_id))), - (raft_state, apply_state, region_state) => { - Ok(RegionInfo::new(raft_state, apply_state, region_state)) - } - } - } - - pub fn region_size>( - &self, - region_id: u64, - cfs: Vec, - ) -> Result> { - let region_state_key = keys::region_state_key(region_id); - match self - .engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) - { - Ok(Some(region_state)) => { - let region = region_state.get_region(); - let start_key = &keys::data_key(region.get_start_key()); - let end_key = &keys::data_end_key(region.get_end_key()); - let mut sizes = vec![]; - for cf in cfs { - let mut size = 0; - box_try!(self.engines.kv.scan( - cf.as_ref(), - start_key, - end_key, - false, - |k, v| { - size += k.len() + v.len(); - Ok(true) - } - )); - sizes.push((cf, size)); - } - Ok(sizes) - } - Ok(None) => Err(Error::NotFound(format!("none region {:?}", region_id))), - Err(e) => Err(box_err!(e)), - } - } - - /// Scan MVCC Infos for given range `[start, end)`. - pub fn scan_mvcc( - &self, - start: &[u8], - end: &[u8], - limit: u64, - ) -> Result> { - if end.is_empty() && limit == 0 { - return Err(Error::InvalidArgument("no limit and to_key".to_owned())); - } - MvccInfoIterator::new( - |cf, opts| { - let kv = &self.engines.kv; - kv.iterator_opt(cf, opts).map_err(|e| box_err!(e)) - }, - if start.is_empty() { None } else { Some(start) }, - if end.is_empty() { None } else { Some(end) }, - limit as usize, - ) - .map_err(|e| box_err!(e)) - } - /// Scan raw keys for given range `[start, end)` in given cf. pub fn raw_scan( &self, @@ -352,32 +251,6 @@ impl Debugger { Ok(res) } - /// Compact the cf[start..end) in the db. 
- pub fn compact( - &self, - db: DbType, - cf: &str, - start: &[u8], - end: &[u8], - threads: u32, - bottommost: BottommostLevelCompaction, - ) -> Result<()> { - validate_db_and_cf(db, cf)?; - let db = self.get_db_from_type(db)?; - let handle = box_try!(get_cf_handle(db.as_inner(), cf)); - let start = if start.is_empty() { None } else { Some(start) }; - let end = if end.is_empty() { None } else { Some(end) }; - info!("Debugger starts manual compact"; "db" => ?db, "cf" => cf); - let mut opts = CompactOptions::new(); - opts.set_max_subcompactions(threads as i32); - opts.set_exclusive_manual_compaction(false); - opts.set_bottommost_level_compaction(bottommost.0); - db.as_inner() - .compact_range_cf_opt(handle, &opts, start, end); - info!("Debugger finishes manual compact"; "db" => ?db, "cf" => cf); - Ok(()) - } - /// Set regions to tombstone by manual, and apply other status(such as /// peers, version, and key range) from `region` which comes from PD /// normally. @@ -859,7 +732,183 @@ impl Debugger { Ok(()) } - pub fn get_store_ident(&self) -> Result { + fn get_region_state(&self, region_id: u64) -> Result { + let region_state_key = keys::region_state_key(region_id); + let region_state = box_try!( + self.engines + .kv + .get_msg_cf::(CF_RAFT, ®ion_state_key) + ); + match region_state { + Some(v) => Ok(v), + None => Err(Error::NotFound(format!("region {}", region_id))), + } + } + + pub fn get_range_properties(&self, start: &[u8], end: &[u8]) -> Result> { + let mut props = dump_write_cf_properties( + &self.engines.kv, + &keys::data_key(start), + &keys::data_end_key(end), + )?; + let mut props1 = dump_default_cf_properties( + &self.engines.kv, + &keys::data_key(start), + &keys::data_end_key(end), + )?; + props.append(&mut props1); + Ok(props) + } +} + +impl Debugger for DebuggerImpl { + fn get(&self, db: DbType, cf: &str, key: &[u8]) -> Result> { + validate_db_and_cf(db, cf)?; + let db = self.get_db_from_type(db)?; + match db.get_value_cf(cf, key) { + Ok(Some(v)) => 
Ok(v.to_vec()), + Ok(None) => Err(Error::NotFound(format!( + "value for key {:?} in db {:?}", + key, db + ))), + Err(e) => Err(box_err!(e)), + } + } + + fn raft_log(&self, region_id: u64, log_index: u64) -> Result { + if let Some(e) = box_try!(self.engines.raft.get_entry(region_id, log_index)) { + return Ok(e); + } + Err(Error::NotFound(format!( + "raft log for region {} at index {}", + region_id, log_index + ))) + } + + fn region_info(&self, region_id: u64) -> Result { + let raft_state = box_try!(self.engines.raft.get_raft_state(region_id)); + + let apply_state_key = keys::apply_state_key(region_id); + let apply_state = box_try!( + self.engines + .kv + .get_msg_cf::(CF_RAFT, &apply_state_key) + ); + + let region_state_key = keys::region_state_key(region_id); + let region_state = box_try!( + self.engines + .kv + .get_msg_cf::(CF_RAFT, ®ion_state_key) + ); + + match (raft_state, apply_state, region_state) { + (None, None, None) => Err(Error::NotFound(format!("info for region {}", region_id))), + (raft_state, apply_state, region_state) => { + Ok(RegionInfo::new(raft_state, apply_state, region_state)) + } + } + } + + fn region_size>(&self, region_id: u64, cfs: Vec) -> Result> { + let region_state_key = keys::region_state_key(region_id); + match self + .engines + .kv + .get_msg_cf::(CF_RAFT, ®ion_state_key) + { + Ok(Some(region_state)) => { + let region = region_state.get_region(); + let start_key = &keys::data_key(region.get_start_key()); + let end_key = &keys::data_end_key(region.get_end_key()); + let mut sizes = vec![]; + for cf in cfs { + let mut size = 0; + box_try!(self.engines.kv.scan( + cf.as_ref(), + start_key, + end_key, + false, + |k, v| { + size += k.len() + v.len(); + Ok(true) + } + )); + sizes.push((cf, size)); + } + Ok(sizes) + } + Ok(None) => Err(Error::NotFound(format!("none region {:?}", region_id))), + Err(e) => Err(box_err!(e)), + } + } + + fn scan_mvcc( + &self, + start: &[u8], + end: &[u8], + limit: u64, + ) -> Result, MvccInfo)>> + Send> { + if 
end.is_empty() && limit == 0 { + return Err(Error::InvalidArgument("no limit and to_key".to_owned())); + } + MvccInfoIterator::new( + |cf, opts| { + let kv = &self.engines.kv; + kv.iterator_opt(cf, opts).map_err(|e| box_err!(e)) + }, + if start.is_empty() { None } else { Some(start) }, + if end.is_empty() { None } else { Some(end) }, + limit as usize, + ) + .map_err(|e| box_err!(e)) + } + + /// Compact the cf[start..end) in the db. + fn compact( + &self, + db: DbType, + cf: &str, + start: &[u8], + end: &[u8], + threads: u32, + bottommost: BottommostLevelCompaction, + ) -> Result<()> { + validate_db_and_cf(db, cf)?; + let db = self.get_db_from_type(db)?; + let handle = box_try!(get_cf_handle(db.as_inner(), cf)); + let start = if start.is_empty() { None } else { Some(start) }; + let end = if end.is_empty() { None } else { Some(end) }; + info!("Debugger starts manual compact"; "db" => ?db, "cf" => cf); + let mut opts = CompactOptions::new(); + opts.set_max_subcompactions(threads as i32); + opts.set_exclusive_manual_compaction(false); + opts.set_bottommost_level_compaction(bottommost.0); + db.as_inner() + .compact_range_cf_opt(handle, &opts, start, end); + info!("Debugger finishes manual compact"; "db" => ?db, "cf" => cf); + Ok(()) + } + + fn get_all_regions_in_store(&self) -> Result> { + let db = &self.engines.kv; + let cf = CF_RAFT; + let start_key = keys::REGION_META_MIN_KEY; + let end_key = keys::REGION_META_MAX_KEY; + let mut regions = Vec::with_capacity(128); + box_try!(db.scan(cf, start_key, end_key, false, |key, _| { + let (id, suffix) = box_try!(keys::decode_region_meta_key(key)); + if suffix != keys::REGION_STATE_SUFFIX { + return Ok(true); + } + regions.push(id); + Ok(true) + })); + regions.sort_unstable(); + Ok(regions) + } + + fn get_store_ident(&self) -> Result { let db = &self.engines.kv; db.get_msg::(keys::STORE_IDENT_KEY) .map_err(|e| box_err!(e)) @@ -869,7 +918,23 @@ impl Debugger { }) } - pub fn modify_tikv_config(&self, config_name: &str, 
config_value: &str) -> Result<()> { + fn dump_kv_stats(&self) -> Result { + let mut kv_str = box_try!(MiscExt::dump_stats(&self.engines.kv)); + if let Some(s) = self.kv_statistics.as_ref() && let Some(s) = s.to_string() { + kv_str.push_str(&s); + } + Ok(kv_str) + } + + fn dump_raft_stats(&self) -> Result { + let mut raft_str = box_try!(RaftEngine::dump_stats(&self.engines.raft)); + if let Some(s) = self.raft_statistics.as_ref() && let Some(s) = s.to_string() { + raft_str.push_str(&s); + } + Ok(raft_str) + } + + fn modify_tikv_config(&self, config_name: &str, config_value: &str) -> Result<()> { if let Err(e) = self.cfg_controller.update_config(config_name, config_value) { return Err(Error::Other( format!("failed to update config, err: {:?}", e).into(), @@ -878,20 +943,7 @@ impl Debugger { Ok(()) } - fn get_region_state(&self, region_id: u64) -> Result { - let region_state_key = keys::region_state_key(region_id); - let region_state = box_try!( - self.engines - .kv - .get_msg_cf::(CF_RAFT, ®ion_state_key) - ); - match region_state { - Some(v) => Ok(v), - None => Err(Error::NotFound(format!("region {}", region_id))), - } - } - - pub fn get_region_properties(&self, region_id: u64) -> Result> { + fn get_region_properties(&self, region_id: u64) -> Result> { let region_state = self.get_region_state(region_id)?; let region = region_state.get_region(); let start = keys::enc_start_key(region); @@ -919,23 +971,16 @@ impl Debugger { Ok(res) } - pub fn get_range_properties(&self, start: &[u8], end: &[u8]) -> Result> { - let mut props = dump_write_cf_properties( - &self.engines.kv, - &keys::data_key(start), - &keys::data_end_key(end), - )?; - let mut props1 = dump_default_cf_properties( - &self.engines.kv, - &keys::data_key(start), - &keys::data_end_key(end), - )?; - props.append(&mut props1); - Ok(props) + fn reset_to_version(&self, version: u64) { + self.reset_to_version_manager.start(version.into()); } - pub fn reset_to_version(&self, version: u64) { - 
self.reset_to_version_manager.start(version.into()); + fn set_kv_statistics(&mut self, s: Option>) { + self.kv_statistics = s; + } + + fn set_raft_statistics(&mut self, s: Option>) { + self.raft_statistics = s; } } @@ -1560,16 +1605,16 @@ mod tests { } } - fn new_debugger() -> Debugger { + fn new_debugger() -> DebuggerImpl { let tmp = Builder::new().prefix("test_debug").tempdir().unwrap(); let path = tmp.path().to_str().unwrap(); let engine = engine_rocks::util::new_engine(path, ALL_CFS).unwrap(); let engines = Engines::new(engine.clone(), engine); - Debugger::new(engines, ConfigController::default()) + DebuggerImpl::new(engines, ConfigController::default()) } - impl Debugger { + impl DebuggerImpl { fn set_store_id(&self, store_id: u64) { let mut ident = self.get_store_ident().unwrap_or_default(); ident.set_store_id(store_id); diff --git a/src/server/debug2.rs b/src/server/debug2.rs new file mode 100644 index 00000000000..bea3da7ca4a --- /dev/null +++ b/src/server/debug2.rs @@ -0,0 +1,1034 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. 
+ +use std::sync::Arc; + +use engine_rocks::{ + raw::CompactOptions, util::get_cf_handle, RocksEngine, RocksEngineIterator, RocksStatistics, +}; +use engine_traits::{ + CachedTablet, Iterable, Peekable, RaftEngine, TabletContext, TabletRegistry, CF_DEFAULT, + CF_LOCK, CF_WRITE, +}; +use keys::{data_key, DATA_MAX_KEY, DATA_PREFIX_KEY}; +use kvproto::{ + debugpb::Db as DbType, + kvrpcpb::MvccInfo, + metapb, + raft_serverpb::{PeerState, RegionLocalState, StoreIdent}, +}; +use nom::AsBytes; +use raft::prelude::Entry; +use raftstore::store::util::check_key_in_region; + +use super::debug::{BottommostLevelCompaction, Debugger, RegionInfo}; +use crate::{ + config::ConfigController, + server::debug::{Error, Result}, + storage::mvcc::{MvccInfoCollector, MvccInfoScanner}, +}; + +// return the region containing the seek_key or the next region if not existed +fn seek_region( + seek_key: &[u8], + sorted_region_states: &[RegionLocalState], +) -> Option { + if sorted_region_states.is_empty() { + return None; + } + + let idx = match sorted_region_states + .binary_search_by(|state| state.get_region().get_start_key().cmp(seek_key)) + { + Ok(idx) => return Some(sorted_region_states[idx].clone()), + Err(idx) => idx, + }; + + // idx == 0 means seek_key is less than the first region's start key + if idx == 0 { + return Some(sorted_region_states[idx].clone()); + } + + let region_state = &sorted_region_states[idx - 1]; + if check_key_in_region(seek_key, region_state.get_region()).is_err() { + return sorted_region_states.get(idx).cloned(); + } + + Some(region_state.clone()) +} + +pub struct MvccInfoIteratorV2 { + scanner: Option>, + tablet_reg: TabletRegistry, + sorted_region_states: Vec, + cur_region: metapb::Region, + start: Vec, + end: Vec, + limit: usize, + count: usize, +} + +impl MvccInfoIteratorV2 { + pub fn new( + sorted_region_states: Vec, + tablet_reg: TabletRegistry, + start: &[u8], + end: &[u8], + limit: usize, + ) -> Result { + let seek_key = if start.is_empty() { + start + } 
else { + &start[DATA_PREFIX_KEY.len()..] + }; + + if let Some(mut first_region_state) = seek_region(seek_key, &sorted_region_states) { + let mut tablet_cache = get_tablet_cache( + &tablet_reg, + first_region_state.get_region().get_id(), + Some(first_region_state.clone()), + )?; + + let tablet = tablet_cache.latest().unwrap(); + let scanner = Some( + MvccInfoScanner::new( + |cf, opts| tablet.iterator_opt(cf, opts).map_err(|e| box_err!(e)), + if start.is_empty() { None } else { Some(start) }, + if end.is_empty() { None } else { Some(end) }, + MvccInfoCollector::default(), + ) + .map_err(|e| -> Error { box_err!(e) })?, + ); + + Ok(MvccInfoIteratorV2 { + scanner, + tablet_reg, + sorted_region_states, + cur_region: first_region_state.take_region(), + start: start.to_vec(), + end: end.to_vec(), + limit, + count: 0, + }) + } else { + Ok(MvccInfoIteratorV2 { + scanner: None, + tablet_reg, + sorted_region_states, + cur_region: metapb::Region::default(), + start: start.to_vec(), + end: end.to_vec(), + limit, + count: 0, + }) + } + } +} + +impl Iterator for MvccInfoIteratorV2 { + type Item = raftstore::Result<(Vec, MvccInfo)>; + + fn next(&mut self) -> Option, MvccInfo)>> { + if self.scanner.is_none() || (self.limit != 0 && self.count >= self.limit) { + return None; + } + + loop { + match self.scanner.as_mut().unwrap().next_item() { + Ok(Some(item)) => { + self.count += 1; + return Some(Ok(item)); + } + Ok(None) => { + let cur_end_key = self.cur_region.get_end_key(); + if cur_end_key.is_empty() { + return None; + } + + let next_region_state = seek_region(cur_end_key, &self.sorted_region_states); + if next_region_state.is_none() { + self.scanner = None; + return None; + } + + let next_region_state = next_region_state.unwrap(); + if &self.cur_region == next_region_state.get_region() { + return None; + } + self.cur_region = next_region_state.get_region().clone(); + let mut tablet_cache = get_tablet_cache( + &self.tablet_reg, + next_region_state.get_region().get_id(), + 
Some(next_region_state.clone()), + ) + .unwrap(); + let tablet = tablet_cache.latest().unwrap(); + self.scanner = Some( + MvccInfoScanner::new( + |cf, opts| tablet.iterator_opt(cf, opts).map_err(|e| box_err!(e)), + if self.start.is_empty() { + None + } else { + Some(self.start.as_bytes()) + }, + if self.end.is_empty() { + None + } else { + Some(self.end.as_bytes()) + }, + MvccInfoCollector::default(), + ) + .unwrap(), + ); + } + Err(e) => return Some(Err(e)), + } + } + } +} + +// Debugger for raftstore-v2 +#[derive(Clone)] +pub struct DebuggerImplV2 { + tablet_reg: TabletRegistry, + raft_engine: ER, + kv_statistics: Option>, + raft_statistics: Option>, + _cfg_controller: ConfigController, +} + +impl DebuggerImplV2 { + pub fn new( + tablet_reg: TabletRegistry, + raft_engine: ER, + cfg_controller: ConfigController, + ) -> Self { + println!("Debugger for raftstore-v2 is used"); + DebuggerImplV2 { + tablet_reg, + raft_engine, + _cfg_controller: cfg_controller, + kv_statistics: None, + raft_statistics: None, + } + } +} + +impl Debugger for DebuggerImplV2 { + fn get(&self, db: DbType, cf: &str, key: &[u8]) -> Result> { + validate_db_and_cf(db, cf)?; + let region_state = + find_region_state_by_key(&self.raft_engine, &key[DATA_PREFIX_KEY.len()..])?; + let mut tablet_cache = get_tablet_cache( + &self.tablet_reg, + region_state.get_region().get_id(), + Some(region_state), + )?; + let tablet = tablet_cache.latest().unwrap(); + match tablet.get_value_cf(cf, key) { + Ok(Some(v)) => Ok(v.to_vec()), + Ok(None) => Err(Error::NotFound(format!( + "value for key {:?} in db {:?}", + key, db + ))), + Err(e) => Err(box_err!(e)), + } + } + + fn raft_log(&self, region_id: u64, log_index: u64) -> Result { + if let Some(log) = box_try!(self.raft_engine.get_entry(region_id, log_index)) { + return Ok(log); + } + Err(Error::NotFound(format!( + "raft log for region {} at index {}", + region_id, log_index + ))) + } + + fn region_info(&self, region_id: u64) -> Result { + let raft_state = 
box_try!(self.raft_engine.get_raft_state(region_id)); + let apply_state = box_try!(self.raft_engine.get_apply_state(region_id, u64::MAX)); + let region_state = box_try!(self.raft_engine.get_region_state(region_id, u64::MAX)); + + match (raft_state, apply_state, region_state) { + (None, None, None) => Err(Error::NotFound(format!("info for region {}", region_id))), + (raft_state, apply_state, region_state) => { + Ok(RegionInfo::new(raft_state, apply_state, region_state)) + } + } + } + + fn region_size>(&self, region_id: u64, cfs: Vec) -> Result> { + match self.raft_engine.get_region_state(region_id, u64::MAX) { + Ok(Some(region_state)) => { + if region_state.get_state() != PeerState::Normal { + return Err(Error::NotFound(format!( + "region {:?} has been deleted", + region_id + ))); + } + let region = region_state.get_region(); + let start_key = &keys::data_key(region.get_start_key()); + let end_key = &keys::data_end_key(region.get_end_key()); + let mut sizes = vec![]; + let mut tablet_cache = + get_tablet_cache(&self.tablet_reg, region.id, Some(region_state))?; + let tablet = tablet_cache.latest().unwrap(); + for cf in cfs { + let mut size = 0; + box_try!(tablet.scan(cf.as_ref(), start_key, end_key, false, |k, v| { + size += k.len() + v.len(); + Ok(true) + })); + sizes.push((cf, size)); + } + Ok(sizes) + } + Ok(None) => Err(Error::NotFound(format!("none region {:?}", region_id))), + Err(e) => Err(box_err!(e)), + } + } + + fn scan_mvcc( + &self, + start: &[u8], + end: &[u8], + limit: u64, + ) -> Result, MvccInfo)>> + Send> { + if end.is_empty() && limit == 0 { + return Err(Error::InvalidArgument("no limit and to_key".to_owned())); + } + if !end.is_empty() && start > end { + return Err(Error::InvalidArgument( + "start key should not be larger than end key".to_owned(), + )); + } + + let mut region_states = vec![]; + self.raft_engine + .for_each_raft_group::(&mut |region_id| { + let region_state = self + .raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + 
.unwrap(); + if region_state.state == PeerState::Normal { + region_states.push(region_state); + } + Ok(()) + }) + .unwrap(); + + region_states.sort_by(|r1, r2| { + r1.get_region() + .get_start_key() + .cmp(r2.get_region().get_start_key()) + }); + + MvccInfoIteratorV2::new( + region_states, + self.tablet_reg.clone(), + start, + end, + limit as usize, + ) + } + + fn compact( + &self, + db: DbType, + cf: &str, + start: &[u8], + end: &[u8], + threads: u32, + bottommost: BottommostLevelCompaction, + ) -> Result<()> { + validate_db_and_cf(db, cf)?; + if db == DbType::Raft { + return Err(box_err!("Get raft db is not allowed")); + } + let mut compactions = vec![]; + self.raft_engine + .for_each_raft_group::(&mut |region_id| { + let region_state = self + .raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(); + if region_state.state != PeerState::Normal { + return Ok(()); + } + + if let Some((start_key, end_key)) = + range_in_region((start, end), region_state.get_region()) + { + let start = if start_key.is_empty() { + None + } else { + Some(data_key(start_key)) + }; + let end = if end_key.is_empty() { + None + } else { + Some(data_key(end_key)) + }; + compactions.push((region_id, start, end, region_state)); + }; + + Ok(()) + }) + .unwrap(); + + for (region_id, start_key, end_key, region_state) in compactions { + let mut tablet_cache = + get_tablet_cache(&self.tablet_reg, region_id, Some(region_state))?; + let talbet = tablet_cache.latest().unwrap(); + info!("Debugger starts manual compact"; "talbet" => ?talbet, "cf" => cf); + let mut opts = CompactOptions::new(); + opts.set_max_subcompactions(threads as i32); + opts.set_exclusive_manual_compaction(false); + opts.set_bottommost_level_compaction(bottommost.0); + let handle = box_try!(get_cf_handle(talbet.as_inner(), cf)); + talbet.as_inner().compact_range_cf_opt( + handle, + &opts, + start_key.as_ref().map(|k| k.as_bytes()), + end_key.as_ref().map(|k| k.as_bytes()), + ); + info!("Debugger finishes 
manual compact"; "db" => ?db, "cf" => cf); + } + + Ok(()) + } + + fn get_all_regions_in_store(&self) -> Result> { + let mut region_ids = vec![]; + self.raft_engine + .for_each_raft_group::(&mut |region_id| { + region_ids.push(region_id); + Ok(()) + }) + .unwrap(); + Ok(region_ids) + } + + fn dump_kv_stats(&self) -> Result { + unimplemented!() + } + + fn dump_raft_stats(&self) -> Result { + unimplemented!() + } + + fn modify_tikv_config(&self, _config_name: &str, _config_value: &str) -> Result<()> { + unimplemented!() + } + + fn get_store_ident(&self) -> Result { + unimplemented!() + } + + fn get_region_properties(&self, _region_id: u64) -> Result> { + unimplemented!() + } + + fn reset_to_version(&self, _version: u64) { + unimplemented!() + } + + fn set_kv_statistics(&mut self, s: Option>) { + self.kv_statistics = s; + } + + fn set_raft_statistics(&mut self, s: Option>) { + self.raft_statistics = s; + } +} + +fn validate_db_and_cf(db: DbType, cf: &str) -> Result<()> { + match (db, cf) { + (DbType::Kv, CF_DEFAULT) + | (DbType::Kv, CF_WRITE) + | (DbType::Kv, CF_LOCK) + | (DbType::Raft, CF_DEFAULT) => Ok(()), + _ => Err(Error::InvalidArgument(format!( + "invalid cf {:?} for db {:?}", + cf, db + ))), + } +} + +// Return the overlap range (without data prefix) of the `range` in region or +// None if they are exclusive +// Note: generally, range should start with `DATA_PREFIX_KEY`, but they can also +// be empty in case of compacting whole cluster for example. 
+// Note: the range end being `DATA_PREFIX_KEY` and `DATA_MAX_KEY` both means the +// largest key +fn range_in_region<'a>( + range: (&'a [u8], &'a [u8]), + region: &'a metapb::Region, +) -> Option<(&'a [u8], &'a [u8])> { + let range_start = if !range.0.is_empty() { + range.0 + } else { + DATA_PREFIX_KEY + }; + + let range_end = if !range.1.is_empty() && range.1 != DATA_MAX_KEY { + range.1 + } else { + DATA_PREFIX_KEY + }; + + if range_start == DATA_PREFIX_KEY && range_end == DATA_PREFIX_KEY { + return Some((region.get_start_key(), region.get_end_key())); + } else if range_start == DATA_PREFIX_KEY { + assert!(range_end.starts_with(DATA_PREFIX_KEY)); + if region.get_start_key() < &range_end[DATA_PREFIX_KEY.len()..] { + return Some(( + region.get_start_key(), + smaller_key( + &range_end[DATA_PREFIX_KEY.len()..], + region.get_end_key(), + true, + ), + )); + } + None + } else if range_end == DATA_PREFIX_KEY { + assert!(range_start.starts_with(DATA_PREFIX_KEY)); + if &range_start[DATA_PREFIX_KEY.len()..] 
< region.get_end_key() + || region.get_end_key().is_empty() + { + return Some(( + larger_key( + &range_start[DATA_PREFIX_KEY.len()..], + region.get_start_key(), + false, + ), + region.get_end_key(), + )); + } + None + } else { + assert!(range_start.starts_with(DATA_PREFIX_KEY)); + assert!(range_end.starts_with(DATA_PREFIX_KEY)); + let start_key = larger_key( + &range_start[DATA_PREFIX_KEY.len()..], + region.get_start_key(), + false, + ); + let end_key = smaller_key( + &range_end[DATA_PREFIX_KEY.len()..], + region.get_end_key(), + true, + ); + if start_key < end_key { + return Some((start_key, end_key)); + } + None + } +} + +fn find_region_state_by_key( + raft_engine: &ER, + key: &[u8], +) -> Result { + let mut region_ids = vec![]; + raft_engine + .for_each_raft_group::(&mut |region_id| { + region_ids.push(region_id); + Ok(()) + }) + .unwrap(); + + for region_id in region_ids { + if let Ok(Some(region_state)) = raft_engine.get_region_state(region_id, u64::MAX) { + let region = region_state.get_region(); + if check_key_in_region(key, region).is_ok() { + if region_state.get_state() != PeerState::Normal { + break; + } + return Ok(region_state); + } + } + } + + Err(Error::NotFound(format!( + "Not found region containing {:?}", + key + ))) +} + +fn get_tablet_cache( + tablet_reg: &TabletRegistry, + region_id: u64, + state: Option, +) -> Result> { + if let Some(tablet_cache) = tablet_reg.get(region_id) { + Ok(tablet_cache) + } else { + let region_state = state.unwrap(); + let ctx = TabletContext::new(region_state.get_region(), Some(region_state.tablet_index)); + match tablet_reg.load(ctx, false) { + Ok(tablet_cache) => Ok(tablet_cache), + Err(e) => { + println!( + "tablet load failed, region_state {:?}", + region_state.get_state() + ); + return Err(box_err!(e)); + } + } + } +} + +// `key1` and `key2` should both be start_key or end_key. 
+fn smaller_key<'a>(key1: &'a [u8], key2: &'a [u8], end_key: bool) -> &'a [u8] { + if end_key && key1.is_empty() { + return key2; + } + if end_key && key2.is_empty() { + return key1; + } + if key1 < key2 { + return key1; + } + key2 +} + +// `key1` and `key2` should both be start_key or end_key. +fn larger_key<'a>(key1: &'a [u8], key2: &'a [u8], end_key: bool) -> &'a [u8] { + if end_key && key1.is_empty() { + return key1; + } + if end_key && key2.is_empty() { + return key2; + } + if key1 < key2 { + return key2; + } + key1 +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use engine_traits::{RaftLogBatch, SyncMutable, CF_DEFAULT, CF_LOCK, CF_WRITE}; + use kvproto::{metapb, raft_serverpb::*}; + use raft::prelude::EntryType; + use raft_log_engine::RaftLogEngine; + + use super::*; + use crate::{ + config::TikvConfig, + server::KvEngineFactoryBuilder, + storage::{txn::tests::must_prewrite_put, TestEngineBuilder}, + }; + + const INITIAL_TABLET_INDEX: u64 = 5; + const INITIAL_APPLY_INDEX: u64 = 5; + + fn new_debugger(path: &Path) -> DebuggerImplV2 { + let mut cfg = TikvConfig::default(); + cfg.storage.data_dir = path.to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); + + let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); + + DebuggerImplV2::new(reg, raft_engine, ConfigController::default()) + } + + #[test] + fn test_get() { + let dir = test_util::temp_dir("test-debugger", false); + let debugger = new_debugger(dir.path()); + let raft_engine = &debugger.raft_engine; + let region_id = 1; + + let mut region = 
metapb::Region::default(); + region.set_id(region_id); + region.set_start_key(b"k10".to_vec()); + region.set_end_key(b"k20".to_vec()); + let mut state = RegionLocalState::default(); + state.set_region(region.clone()); + state.set_tablet_index(5); + + let ctx = TabletContext::new(®ion, Some(5)); + let mut tablet_cache = debugger.tablet_reg.load(ctx, true).unwrap(); + let tablet = tablet_cache.latest().unwrap(); + + let mut wb = raft_engine.log_batch(10); + wb.put_region_state(region_id, 10, &state).unwrap(); + raft_engine.consume(&mut wb, true).unwrap(); + + let cfs = vec![CF_DEFAULT, CF_LOCK, CF_WRITE]; + let (k, v) = (keys::data_key(b"k15"), b"v"); + for cf in &cfs { + tablet.put_cf(cf, k.as_slice(), v).unwrap(); + } + + for cf in &cfs { + let got = debugger.get(DbType::Kv, cf, &k).unwrap(); + assert_eq!(&got, v); + } + + match debugger.get(DbType::Kv, CF_DEFAULT, b"k15") { + Err(Error::NotFound(_)) => (), + _ => panic!("expect Error::NotFound(_)"), + } + + let mut wb = raft_engine.log_batch(10); + state.set_state(PeerState::Tombstone); + wb.put_region_state(region_id, 10, &state).unwrap(); + raft_engine.consume(&mut wb, true).unwrap(); + for cf in &cfs { + debugger.get(DbType::Kv, cf, &k).unwrap_err(); + } + } + + #[test] + fn test_raft_log() { + let dir = test_util::temp_dir("test-debugger", false); + let debugger = new_debugger(dir.path()); + let raft_engine = &debugger.raft_engine; + let (region_id, log_index) = (1, 1); + + let mut entry = Entry::default(); + entry.set_term(1); + entry.set_index(1); + entry.set_entry_type(EntryType::EntryNormal); + entry.set_data(vec![42].into()); + let mut wb = raft_engine.log_batch(10); + RaftLogBatch::append(&mut wb, region_id, None, vec![entry.clone()]).unwrap(); + raft_engine.consume(&mut wb, true).unwrap(); + + assert_eq!(debugger.raft_log(region_id, log_index).unwrap(), entry); + match debugger.raft_log(region_id + 1, log_index + 1) { + Err(Error::NotFound(_)) => (), + _ => panic!("expect Error::NotFound(_)"), + } + } + 
+ #[test] + fn test_region_info() { + let dir = test_util::temp_dir("test-debugger", false); + let debugger = new_debugger(dir.path()); + let raft_engine = &debugger.raft_engine; + let region_id = 1; + + let mut wb = raft_engine.log_batch(10); + let mut raft_state = RaftLocalState::default(); + raft_state.set_last_index(42); + RaftLogBatch::put_raft_state(&mut wb, region_id, &raft_state).unwrap(); + + let mut apply_state = RaftApplyState::default(); + apply_state.set_applied_index(42); + RaftLogBatch::put_apply_state(&mut wb, region_id, 42, &apply_state).unwrap(); + + let mut region_state = RegionLocalState::default(); + region_state.set_state(PeerState::Tombstone); + RaftLogBatch::put_region_state(&mut wb, region_id, 42, ®ion_state).unwrap(); + + raft_engine.consume(&mut wb, true).unwrap(); + + assert_eq!( + debugger.region_info(region_id).unwrap(), + RegionInfo::new(Some(raft_state), Some(apply_state), Some(region_state)) + ); + match debugger.region_info(region_id + 1) { + Err(Error::NotFound(_)) => (), + _ => panic!("expect Error::NotFound(_)"), + } + } + + #[test] + fn test_region_size() { + let dir = test_util::temp_dir("test-debugger", false); + let debugger = new_debugger(dir.path()); + let raft_engine = &debugger.raft_engine; + let region_id = 1; + + let mut region = metapb::Region::default(); + region.set_id(region_id); + region.set_start_key(b"k10".to_vec()); + region.set_end_key(b"k20".to_vec()); + let mut state = RegionLocalState::default(); + state.set_region(region.clone()); + state.set_tablet_index(5); + + let ctx = TabletContext::new(®ion, Some(5)); + let mut tablet_cache = debugger.tablet_reg.load(ctx, true).unwrap(); + let tablet = tablet_cache.latest().unwrap(); + + let mut wb = raft_engine.log_batch(10); + wb.put_region_state(region_id, 10, &state).unwrap(); + raft_engine.consume(&mut wb, true).unwrap(); + + let cfs = vec![CF_DEFAULT, CF_LOCK, CF_WRITE]; + let (k, v) = (keys::data_key(b"k15"), b"v"); + for cf in &cfs { + tablet.put_cf(cf, 
k.as_slice(), v).unwrap(); + } + + let sizes = debugger.region_size(region_id, cfs.clone()).unwrap(); + assert_eq!(sizes.len(), 3); + for (cf, size) in sizes { + cfs.iter().find(|&&c| c == cf).unwrap(); + assert_eq!(size, k.len() + v.len()); + } + + // test for region that has not been trimmed + let (k, v) = (keys::data_key(b"k05"), b"v"); + let k1 = keys::data_key(b"k25"); + for cf in &cfs { + tablet.put_cf(cf, k.as_slice(), v).unwrap(); + tablet.put_cf(cf, k1.as_slice(), v).unwrap(); + } + + let sizes = debugger.region_size(region_id, cfs.clone()).unwrap(); + assert_eq!(sizes.len(), 3); + for (cf, size) in sizes { + cfs.iter().find(|&&c| c == cf).unwrap(); + assert_eq!(size, k.len() + v.len()); + } + + state.set_state(PeerState::Tombstone); + let mut wb = raft_engine.log_batch(10); + wb.put_region_state(region_id, 10, &state).unwrap(); + raft_engine.consume(&mut wb, true).unwrap(); + debugger.region_size(region_id, cfs.clone()).unwrap_err(); + } + + // For simplicity, the format of the key is inline with data in + // prepare_data_on_disk + fn extract_key(key: &[u8]) -> &[u8] { + &key[1..4] + } + + // Prepare some data + // Data for each region: + // Region 1: k00 .. k04 + // Region 2: k05 .. k09 + // Region 3: k10 .. k14 + // Region 4: k15 .. k19 + // Region 5: k20 .. k24 + // Region 6: k26 .. 
k28 + fn prepare_data_on_disk(path: &Path) { + let mut cfg = TikvConfig::default(); + cfg.storage.data_dir = path.to_str().unwrap().to_string(); + cfg.raft_store.raftdb_path = cfg.infer_raft_db_path(None).unwrap(); + cfg.raft_engine.mut_config().dir = cfg.infer_raft_engine_path(None).unwrap(); + cfg.gc.enable_compaction_filter = false; + let cache = cfg + .storage + .block_cache + .build_shared_cache(cfg.storage.engine); + let env = cfg.build_shared_rocks_env(None, None).unwrap(); + + let factory = KvEngineFactoryBuilder::new(env, &cfg, cache).build(); + let reg = TabletRegistry::new(Box::new(factory), path).unwrap(); + + let raft_engine = RaftLogEngine::new(cfg.raft_engine.config(), None, None).unwrap(); + let mut wb = raft_engine.log_batch(5); + for i in 0..6 { + let mut region = metapb::Region::default(); + let start_key = format!("k{:02}", i * 5); + let end_key = format!("k{:02}", (i + 1) * 5); + region.set_id(i + 1); + region.set_start_key(start_key.into_bytes()); + region.set_end_key(end_key.into_bytes()); + let mut region_state = RegionLocalState::default(); + region_state.set_tablet_index(INITIAL_TABLET_INDEX); + if region.get_id() == 4 { + region_state.set_state(PeerState::Tombstone); + } else if region.get_id() == 6 { + region.set_start_key(b"k26".to_vec()); + region.set_end_key(b"k28".to_vec()); + } + region_state.set_region(region); + + let tablet_path = reg.tablet_path(i + 1, INITIAL_TABLET_INDEX); + // Use tikv_kv::RocksEngine instead of loading tablet from registry in order to + // use prewrite method to prepare mvcc data + let mut engine = TestEngineBuilder::new().path(tablet_path).build().unwrap(); + for i in i * 5..(i + 1) * 5 { + let key = format!("zk{:02}", i); + let val = format!("val{:02}", i); + // Using prewrite only is enough for preparing mvcc data + must_prewrite_put( + &mut engine, + key.as_bytes(), + val.as_bytes(), + key.as_bytes(), + 10, + ); + } + + wb.put_region_state(i + 1, INITIAL_APPLY_INDEX, &region_state) + .unwrap(); + } + 
raft_engine.consume(&mut wb, true).unwrap(); + } + + #[test] + fn test_scan_mvcc() { + let dir = test_util::temp_dir("test-debugger", false); + prepare_data_on_disk(dir.path()); + let debugger = new_debugger(dir.path()); + // Test scan with bad start, end or limit. + assert!(debugger.scan_mvcc(b"z", b"", 0).is_err()); + assert!(debugger.scan_mvcc(b"z", b"x", 3).is_err()); + + let verify_scanner = + |range, scanner: &mut dyn Iterator<Item = raftstore::Result<(Vec<u8>, MvccInfo)>>| { + for i in range { + let key = format!("k{:02}", i).into_bytes(); + assert_eq!(key, extract_key(&scanner.next().unwrap().unwrap().0)); + } + }; + + // full scan + let mut scanner = debugger.scan_mvcc(b"", b"", 100).unwrap(); + verify_scanner(0..15, &mut scanner); + verify_scanner(20..25, &mut scanner); + verify_scanner(26..28, &mut scanner); + assert!(scanner.next().is_none()); + + // Range has more elements than limit + let mut scanner = debugger.scan_mvcc(b"zk01", b"zk09", 5).unwrap(); + verify_scanner(1..6, &mut scanner); + assert!(scanner.next().is_none()); + + // Range has fewer elements than limit + let mut scanner = debugger.scan_mvcc(b"zk07", b"zk10", 10).unwrap(); + verify_scanner(7..10, &mut scanner); + assert!(scanner.next().is_none()); + + // Start from the key where no region contains it + let mut scanner = debugger.scan_mvcc(b"zk16", b"", 100).unwrap(); + verify_scanner(20..25, &mut scanner); + verify_scanner(26..28, &mut scanner); + assert!(scanner.next().is_none()); + + // Scan a range that does not exist in the cluster + let mut scanner = debugger.scan_mvcc(b"zk16", b"zk19", 100).unwrap(); + assert!(scanner.next().is_none()); + + // The end key is less than the start_key of the first region + let mut scanner = debugger.scan_mvcc(b"", b"zj", 100).unwrap(); + assert!(scanner.next().is_none()); + } + + #[test] + fn test_compact() { + let dir = test_util::temp_dir("test-debugger", false); + let debugger = new_debugger(dir.path()); + let compact = |db, cf| debugger.compact(db, cf, &[0], &[0xFF], 1, 
Some("skip").into()); + compact(DbType::Kv, CF_DEFAULT).unwrap(); + compact(DbType::Kv, CF_LOCK).unwrap(); + compact(DbType::Kv, CF_WRITE).unwrap(); + compact(DbType::Raft, CF_DEFAULT).unwrap_err(); + } + + #[test] + fn test_range_in_region() { + let mut region = metapb::Region::default(); + region.set_start_key(b"k01".to_vec()); + region.set_end_key(b"k10".to_vec()); + + let ranges = vec![ + ("", "", "k01", "k10"), + ("z", "z", "k01", "k10"), + ("zk00", "", "k01", "k10"), + ("zk00", "z", "k01", "k10"), + ("", "zk11", "k01", "k10"), + ("z", "zk11", "k01", "k10"), + ("zk02", "zk07", "k02", "k07"), + ("zk00", "zk07", "k01", "k07"), + ("zk02", "zk11", "k02", "k10"), + ("zk02", "{", "k02", "k10"), + ]; + + for (range_start, range_end, expect_start, expect_end) in ranges { + assert_eq!( + (expect_start.as_bytes(), expect_end.as_bytes()), + range_in_region((range_start.as_bytes(), range_end.as_bytes()), &region).unwrap() + ); + } + + let ranges = vec![("zk05", "zk02"), ("zk11", ""), ("", "zk00")]; + for (range_start, range_end) in ranges { + assert!( + range_in_region((range_start.as_bytes(), range_end.as_bytes()), &region).is_none() + ); + } + + region.set_start_key(b"".to_vec()); + region.set_end_key(b"k10".to_vec()); + + let ranges = vec![ + ("", "", "", "k10"), + ("z", "z", "", "k10"), + ("zk00", "", "k00", "k10"), + ("zk00", "z", "k00", "k10"), + ("", "zk11", "", "k10"), + ("z", "zk11", "", "k10"), + ("zk02", "zk07", "k02", "k07"), + ("zk02", "zk11", "k02", "k10"), + ("zk02", "{", "k02", "k10"), + ]; + + for (range_start, range_end, expect_start, expect_end) in ranges { + assert_eq!( + (expect_start.as_bytes(), expect_end.as_bytes()), + range_in_region((range_start.as_bytes(), range_end.as_bytes()), &region).unwrap() + ); + } + + let ranges = vec![("zk05", "zk02"), ("zk11", "")]; + for (range_start, range_end) in ranges { + assert!( + range_in_region((range_start.as_bytes(), range_end.as_bytes()), &region).is_none() + ); + } + + region.set_start_key(b"k01".to_vec()); + 
region.set_end_key(b"".to_vec()); + + let ranges = vec![ + ("", "", "k01", ""), + ("z", "z", "k01", ""), + ("zk00", "", "k01", ""), + ("zk00", "z", "k01", ""), + ("", "zk11", "k01", "k11"), + ("z", "zk11", "k01", "k11"), + ("zk02", "zk07", "k02", "k07"), + ("zk02", "zk11", "k02", "k11"), + ("zk02", "{", "k02", ""), + ]; + + for (range_start, range_end, expect_start, expect_end) in ranges { + assert_eq!( + (expect_start.as_bytes(), expect_end.as_bytes()), + range_in_region((range_start.as_bytes(), range_end.as_bytes()), &region).unwrap() + ); + } + + let ranges = vec![("zk05", "zk02"), ("", "zk00")]; + for (range_start, range_end) in ranges { + assert!( + range_in_region((range_start.as_bytes(), range_end.as_bytes()), &region).is_none() + ); + } + } +} diff --git a/src/server/mod.rs b/src/server/mod.rs index 773e2040f17..00d9fe70d4f 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -5,6 +5,7 @@ mod raft_client; pub mod config; pub mod debug; +pub mod debug2; mod engine_factory; pub mod errors; pub mod gc_worker; @@ -34,7 +35,7 @@ pub use self::{ metrics::{CONFIG_ROCKSDB_GAUGE, CPU_CORES_QUOTA_GAUGE, MEM_TRACE_SUM_GAUGE}, node::Node, proxy::{build_forward_option, get_target_address, Proxy}, - raft_client::{ConnectionBuilder, RaftClient}, + raft_client::{ConnectionBuilder, MetadataSourceStoreId, RaftClient}, raftkv::RaftKv, raftkv2::{Extension, NodeV2, RaftKv2}, resolve::{PdStoreAddrResolver, StoreAddrResolver}, diff --git a/src/server/raft_client.rs b/src/server/raft_client.rs index 17de1d3365d..f30e5b36045 100644 --- a/src/server/raft_client.rs +++ b/src/server/raft_client.rs @@ -25,8 +25,8 @@ use futures::{ }; use futures_timer::Delay; use grpcio::{ - Channel, ChannelBuilder, ClientCStreamReceiver, ClientCStreamSender, Environment, - RpcStatusCode, WriteFlags, + CallOption, Channel, ChannelBuilder, ClientCStreamReceiver, ClientCStreamSender, Environment, + MetadataBuilder, RpcStatusCode, WriteFlags, }; use kvproto::{ raft_serverpb::{Done, RaftMessage, 
RaftSnapshotData}, @@ -50,6 +50,21 @@ use crate::server::{ StoreAddrResolver, }; +pub struct MetadataSourceStoreId {} + +impl MetadataSourceStoreId { + pub const KEY: &str = "source_store_id"; + + pub fn parse(value: &[u8]) -> u64 { + let value = std::str::from_utf8(value).unwrap(); + value.parse::().unwrap() + } + + pub fn format(id: u64) -> String { + format!("{}", id) + } +} + static CONN_ID: AtomicI32 = AtomicI32::new(0); const _ON_RESOLVE_FP: &str = "transport_snapshot_on_resolve"; @@ -616,6 +631,7 @@ impl ConnectionBuilder { /// StreamBackEnd watches lifetime of a connection and handles reconnecting, /// spawn new RPC. struct StreamBackEnd { + self_store_id: u64, store_id: u64, queue: Arc, builder: ConnectionBuilder, @@ -697,7 +713,8 @@ where } fn batch_call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver { - let (batch_sink, batch_stream) = client.batch_raft().unwrap(); + let (batch_sink, batch_stream) = client.batch_raft_opt(self.get_call_option()).unwrap(); + let (tx, rx) = oneshot::channel(); let mut call = RaftCall { sender: AsyncRaftSender { @@ -721,7 +738,8 @@ where } fn call(&self, client: &TikvClient, addr: String) -> oneshot::Receiver { - let (sink, stream) = client.raft().unwrap(); + let (sink, stream) = client.raft_opt(self.get_call_option()).unwrap(); + let (tx, rx) = oneshot::channel(); let mut call = RaftCall { sender: AsyncRaftSender { @@ -742,6 +760,15 @@ where }); rx } + + fn get_call_option(&self) -> CallOption { + let mut metadata = MetadataBuilder::with_capacity(1); + let value = MetadataSourceStoreId::format(self.self_store_id); + metadata + .add_str(MetadataSourceStoreId::KEY, &value) + .unwrap(); + CallOption::default().headers(metadata.build()) + } } async fn maybe_backoff(backoff: Duration, last_wake_time: &mut Option) { @@ -782,7 +809,6 @@ async fn start( R: RaftExtension + Unpin + Send + 'static, { let mut last_wake_time = None; - let mut first_time = true; let backoff_duration = 
back_end.builder.cfg.value().raft_client_max_backoff.0; let mut addr_channel = None; loop { @@ -828,15 +854,10 @@ async fn start( // shutdown. back_end.clear_pending_message("unreachable"); - // broadcast is time consuming operation which would blocks raftstore, so report - // unreachable only once until being connected again. - if first_time { - first_time = false; - back_end - .builder - .router - .report_store_unreachable(back_end.store_id); - } + back_end + .builder + .router + .report_store_unreachable(back_end.store_id); continue; } else { debug!("connection established"; "store_id" => back_end.store_id, "addr" => %addr); @@ -868,7 +889,6 @@ async fn start( .router .report_store_unreachable(back_end.store_id); addr_channel = None; - first_time = false; } } } @@ -926,6 +946,7 @@ struct CachedQueue { /// raft_client.flush(); /// ``` pub struct RaftClient { + self_store_id: u64, pool: Arc>, cache: LruCache<(u64, usize), CachedQueue>, need_flush: Vec<(u64, usize)>, @@ -940,13 +961,14 @@ where S: StoreAddrResolver + Send + 'static, R: RaftExtension + Unpin + Send + 'static, { - pub fn new(builder: ConnectionBuilder) -> Self { + pub fn new(self_store_id: u64, builder: ConnectionBuilder) -> Self { let future_pool = Arc::new( yatp::Builder::new(thd_name!("raft-stream")) .max_thread_count(1) .build_future_pool(), ); RaftClient { + self_store_id, pool: Arc::default(), cache: LruCache::with_capacity_and_sample(0, 7), need_flush: vec![], @@ -982,6 +1004,7 @@ where queue.set_conn_state(ConnState::Paused); } let back_end = StreamBackEnd { + self_store_id: self.self_store_id, store_id, queue: queue.clone(), builder: self.builder.clone(), @@ -1143,6 +1166,7 @@ where { fn clone(&self) -> Self { RaftClient { + self_store_id: self.self_store_id, pool: self.pool.clone(), cache: LruCache::with_capacity_and_sample(0, 7), need_flush: vec![], diff --git a/src/server/raftkv/mod.rs b/src/server/raftkv/mod.rs index 697a4b39d63..039f987c398 100644 --- a/src/server/raftkv/mod.rs +++ 
b/src/server/raftkv/mod.rs @@ -117,7 +117,7 @@ where Snap(RegionSnapshot), } -fn check_raft_cmd_response(resp: &mut RaftCmdResponse) -> Result<()> { +pub fn check_raft_cmd_response(resp: &mut RaftCmdResponse) -> Result<()> { if resp.get_header().has_error() { return Err(Error::RequestFailed(resp.take_header().take_error())); } @@ -735,7 +735,7 @@ impl ReadIndexObserver for ReplicaReadLockChecker { start_key.as_ref(), end_key.as_ref(), |key, lock| { - txn_types::Lock::check_ts_conflict( + txn_types::Lock::check_ts_conflict_for_replica_read( Cow::Borrowed(lock), key, start_ts, diff --git a/src/server/raftkv2/mod.rs b/src/server/raftkv2/mod.rs index 5434da9ce91..d4a158bffda 100644 --- a/src/server/raftkv2/mod.rs +++ b/src/server/raftkv2/mod.rs @@ -12,8 +12,11 @@ use std::{ use collections::HashSet; use engine_traits::{KvEngine, RaftEngine, CF_LOCK}; -use futures::{Future, Stream, StreamExt}; -use kvproto::raft_cmdpb::{CmdType, RaftCmdRequest, Request}; +use futures::{future::BoxFuture, Future, Stream, StreamExt}; +use kvproto::{ + kvrpcpb::Context, + raft_cmdpb::{AdminCmdType, CmdType, RaftCmdRequest, Request}, +}; pub use node::NodeV2; pub use raft_extension::Extension; use raftstore::store::{util::encode_start_ts_into_flag_data, RegionSnapshot}; @@ -29,7 +32,10 @@ use txn_types::{TxnExtra, TxnExtraScheduler, WriteBatchFlags}; use super::{ metrics::{ASYNC_REQUESTS_COUNTER_VEC, ASYNC_REQUESTS_DURATIONS_VEC}, - raftkv::{get_status_kind_from_engine_error, new_request_header}, + raftkv::{ + check_raft_cmd_response, get_status_kind_from_engine_error, new_flashback_req, + new_request_header, + }, }; struct Transform { @@ -150,7 +156,7 @@ impl tikv_kv::Engine for RaftKv2 { Ok(()) } - type SnapshotRes = impl Future> + Send; + type SnapshotRes = impl Future> + Send + 'static; fn async_snapshot(&mut self, mut ctx: tikv_kv::SnapContext<'_>) -> Self::SnapshotRes { let mut req = Request::default(); req.set_cmd_type(CmdType::Snap); @@ -169,10 +175,9 @@ impl tikv_kv::Engine for 
RaftKv2 { if ctx.pb_ctx.get_stale_read() && need_encoded_start_ts { flags |= WriteBatchFlags::STALE_READ.bits(); } - // TODO: flashback is not supported yet. - // if ctx.allowed_in_flashback { - // flags |= WriteBatchFlags::FLASHBACK.bits(); - // } + if ctx.allowed_in_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } header.set_flags(flags); // Encode `start_ts` in `flag_data` for the check of stale read and flashback. if need_encoded_start_ts { @@ -235,10 +240,9 @@ impl tikv_kv::Engine for RaftKv2 { if batch.extra.one_pc { flags |= WriteBatchFlags::ONE_PC.bits(); } - // TODO: flashback is not supported yet. - // if batch.extra.allowed_in_flashback { - // flags |= WriteBatchFlags::FLASHBACK.bits(); - // } + if batch.extra.allowed_in_flashback { + flags |= WriteBatchFlags::FLASHBACK.bits(); + } header.set_flags(flags); self.schedule_txn_extra(batch.extra); @@ -313,4 +317,51 @@ impl tikv_kv::Engine for RaftKv2 { } } } + + fn start_flashback( + &self, + ctx: &Context, + start_ts: u64, + ) -> BoxFuture<'static, tikv_kv::Result<()>> { + // Send an `AdminCmdType::PrepareFlashback` to prepare the raftstore for the + // later flashback. Once invoked, we will update the persistent region meta and + // the memory state of the flashback in Peer FSM to reject all read, write + // and scheduling operations for this region when propose/apply before we + // start the actual data flashback transaction command in the next phase. 
+ let mut req = new_flashback_req(ctx, AdminCmdType::PrepareFlashback); + req.mut_admin_request() + .mut_prepare_flashback() + .set_start_ts(start_ts); + exec_admin(&self.router, req) + } + + fn end_flashback(&self, ctx: &Context) -> BoxFuture<'static, tikv_kv::Result<()>> { + // Send an `AdminCmdType::FinishFlashback` to unset the persistence state + // in `RegionLocalState` and region's meta, and when that admin cmd is applied, + // will update the memory state of the flashback + let req = new_flashback_req(ctx, AdminCmdType::FinishFlashback); + exec_admin(&self.router, req) + } +} + +fn exec_admin( + router: &RaftRouter, + req: RaftCmdRequest, +) -> BoxFuture<'static, tikv_kv::Result<()>> { + let region_id = req.get_header().get_region_id(); + let admin_type = req.get_admin_request().get_cmd_type(); + let (msg, sub) = PeerMsg::admin_command(req); + let res = router.check_send(region_id, msg); + Box::pin(async move { + res?; + let mut resp = sub.result().await.ok_or_else(|| -> tikv_kv::Error { + box_err!( + "region {} exec_admin {:?} without response", + region_id, + admin_type + ) + })?; + check_raft_cmd_response(&mut resp)?; + Ok(()) + }) } diff --git a/src/server/server.rs b/src/server/server.rs index 45778835d29..8a50f44f363 100644 --- a/src/server/server.rs +++ b/src/server/server.rs @@ -176,7 +176,7 @@ where lazy_worker.scheduler(), grpc_thread_load.clone(), ); - let raft_client = RaftClient::new(conn_builder); + let raft_client = RaftClient::new(store_id, conn_builder); let trans = ServerTransport::new(raft_client); health_service.set_serving_status("", ServingStatus::NotServing); diff --git a/src/server/service/debug.rs b/src/server/service/debug.rs index e0ec9173ad5..7b2a694c99a 100644 --- a/src/server/service/debug.rs +++ b/src/server/service/debug.rs @@ -1,9 +1,5 @@ // Copyright 2017 TiKV Project Authors. Licensed under Apache-2.0. 
-use std::sync::Arc; - -use engine_rocks::{RocksEngine, RocksStatistics}; -use engine_traits::{Engines, RaftEngine}; use futures::{ future::{Future, FutureExt, TryFutureExt}, sink::SinkExt, @@ -18,10 +14,7 @@ use tikv_kv::RaftExtension; use tikv_util::metrics; use tokio::runtime::Handle; -use crate::{ - config::ConfigController, - server::debug::{Debugger, Error, Result}, -}; +use crate::server::debug::{Debugger, Error, Result}; fn error_to_status(e: Error) -> RpcStatus { let (code, msg) = match e { @@ -45,26 +38,24 @@ fn error_to_grpc_error(tag: &'static str, e: Error) -> GrpcError { /// Service handles the RPC messages for the `Debug` service. #[derive(Clone)] -pub struct Service { +pub struct Service +where + T: RaftExtension, + D: Debugger, +{ pool: Handle, - debugger: Debugger, + debugger: D, raft_router: T, } -impl Service { +impl Service +where + T: RaftExtension, + D: Debugger, +{ /// Constructs a new `Service` with `Engines`, a `RaftExtension` and a /// `GcWorker`. - pub fn new( - engines: Engines, - kv_statistics: Option>, - raft_statistics: Option>, - pool: Handle, - raft_router: T, - cfg_controller: ConfigController, - ) -> Self { - let mut debugger = Debugger::new(engines, cfg_controller); - debugger.set_kv_statistics(kv_statistics); - debugger.set_raft_statistics(raft_statistics); + pub fn new(debugger: D, pool: Handle, raft_router: T) -> Self { Service { pool, debugger, @@ -93,7 +84,11 @@ impl Service { } } -impl debugpb::Debug for Service { +impl debugpb::Debug for Service +where + T: RaftExtension + 'static, + D: Debugger + Clone + Send + 'static, +{ fn get(&mut self, ctx: RpcContext<'_>, mut req: GetRequest, sink: UnarySink) { const TAG: &str = "debug_get"; diff --git a/src/server/service/kv.rs b/src/server/service/kv.rs index 2c77ee4e0bd..9895067fcb3 100644 --- a/src/server/service/kv.rs +++ b/src/server/service/kv.rs @@ -21,7 +21,7 @@ use raft::eraftpb::MessageType; use raftstore::{ store::{ memory::{MEMTRACE_APPLYS, MEMTRACE_RAFT_ENTRIES, 
MEMTRACE_RAFT_MESSAGES}, - metrics::RAFT_ENTRIES_CACHES_GAUGE, + metrics::{MESSAGE_RECV_BY_STORE, RAFT_ENTRIES_CACHES_GAUGE}, CheckLeaderTask, }, Error as RaftStoreError, Result as RaftStoreResult, @@ -45,7 +45,7 @@ use crate::{ coprocessor_v2, forward_duplex, forward_unary, log_net_error, server::{ gc_worker::GcWorker, load_statistics::ThreadLoadPool, metrics::*, snap::Task as SnapTask, - Error, Proxy, Result as ServerResult, + Error, MetadataSourceStoreId, Proxy, Result as ServerResult, }, storage::{ self, @@ -168,9 +168,23 @@ impl Service { ch.report_reject_message(id, peer_id); return Ok(()); } + + fail_point!("receive_raft_message_from_outside"); ch.feed(msg, false); Ok(()) } + + fn get_store_id_from_metadata(ctx: &RpcContext<'_>) -> Option { + let metadata = ctx.request_headers(); + for i in 0..metadata.len() { + let (key, value) = metadata.get(i).unwrap(); + if key == MetadataSourceStoreId::KEY { + let store_id = MetadataSourceStoreId::parse(value); + return Some(store_id); + } + } + None + } } macro_rules! 
handle_request { @@ -636,6 +650,14 @@ impl Tikv for Service { stream: RequestStream, sink: ClientStreamingSink, ) { + let source_store_id = Self::get_store_id_from_metadata(&ctx); + let message_received = + source_store_id.map(|x| MESSAGE_RECV_BY_STORE.with_label_values(&[&format!("{}", x)])); + info!( + "raft RPC is called, new gRPC stream established"; + "source_store_id" => ?source_store_id, + ); + let store_id = self.store_id; let ch = self.storage.get_engine().raft_extension(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; @@ -652,6 +674,9 @@ impl Tikv for Service { // `StoreNotMatch` to let tikv to resolve a correct address from PD return Err(Error::from(err)); } + if let Some(ref counter) = message_received { + counter.inc(); + } } Ok::<(), Error>(()) }; @@ -678,7 +703,14 @@ impl Tikv for Service { stream: RequestStream, sink: ClientStreamingSink, ) { - info!("batch_raft RPC is called, new gRPC stream established"); + let source_store_id = Self::get_store_id_from_metadata(&ctx); + let message_received = + source_store_id.map(|x| MESSAGE_RECV_BY_STORE.with_label_values(&[&format!("{}", x)])); + info!( + "batch_raft RPC is called, new gRPC stream established"; + "source_store_id" => ?source_store_id, + ); + let store_id = self.store_id; let ch = self.storage.get_engine().raft_extension(); let reject_messages_on_memory_ratio = self.reject_messages_on_memory_ratio; @@ -699,6 +731,9 @@ impl Tikv for Service { return Err(Error::from(err)); } } + if let Some(ref counter) = message_received { + counter.inc_by(len as u64); + } } Ok::<(), Error>(()) }; diff --git a/src/server/status_server/mod.rs b/src/server/status_server/mod.rs index 2ce7a8714c0..1b689138f11 100644 --- a/src/server/status_server/mod.rs +++ b/src/server/status_server/mod.rs @@ -411,6 +411,16 @@ where } } + async fn get_engine_type(cfg_controller: &ConfigController) -> hyper::Result> { + let engine_type = cfg_controller.get_engine_type(); + let response = 
Response::builder() + .header("Content-Type", mime::TEXT_PLAIN.to_string()) + .header("Content-Length", engine_type.len()) + .body(engine_type.into()) + .unwrap(); + Ok(response) + } + pub fn stop(self) { let _ = self.tx.send(()); self.thread_pool.shutdown_timeout(Duration::from_secs(3)); @@ -609,6 +619,9 @@ where (Method::POST, "/config") => { Self::update_config(cfg_controller.clone(), req).await } + (Method::GET, "/engine_type") => { + Self::get_engine_type(&cfg_controller).await + } // This interface is used for configuration file hosting scenarios, // TiKV will not update configuration files, and this interface will // silently ignore configration items that cannot be updated online, @@ -1024,6 +1037,7 @@ mod tests { use crate::{ config::{ConfigController, TikvConfig}, server::status_server::{profile::TEST_PROFILE_MUTEX, LogLevelRequest, StatusServer}, + storage::config::EngineType, }; #[derive(Clone)] @@ -1573,4 +1587,43 @@ mod tests { block_on(handle).unwrap(); status_server.stop(); } + + #[test] + fn test_get_engine_type() { + let mut multi_rocks_cfg = TikvConfig::default(); + multi_rocks_cfg.storage.engine = EngineType::RaftKv2; + let cfgs = [TikvConfig::default(), multi_rocks_cfg]; + let resp_strs = ["raft-kv", "partitioned-raft-kv"]; + for (cfg, resp_str) in IntoIterator::into_iter(cfgs).zip(resp_strs) { + let temp_dir = tempfile::TempDir::new().unwrap(); + let mut status_server = StatusServer::new( + 1, + ConfigController::new(cfg), + Arc::new(SecurityConfig::default()), + MockRouter, + temp_dir.path().to_path_buf(), + None, + ) + .unwrap(); + let addr = "127.0.0.1:0".to_owned(); + let _ = status_server.start(addr); + let client = Client::new(); + let uri = Uri::builder() + .scheme("http") + .authority(status_server.listening_addr().to_string().as_str()) + .path_and_query("/engine_type") + .build() + .unwrap(); + + let handle = status_server.thread_pool.spawn(async move { + let res = client.get(uri).await.unwrap(); + assert_eq!(res.status(), 
StatusCode::OK); + let body_bytes = hyper::body::to_bytes(res.into_body()).await.unwrap(); + let engine_type = String::from_utf8(body_bytes.as_ref().to_owned()).unwrap(); + assert_eq!(engine_type, resp_str); + }); + block_on(handle).unwrap(); + status_server.stop(); + } + } } diff --git a/src/server/tablet_snap.rs b/src/server/tablet_snap.rs index 63905d7825b..cb7ec7c988a 100644 --- a/src/server/tablet_snap.rs +++ b/src/server/tablet_snap.rs @@ -25,10 +25,7 @@ use std::{ fs::{self, File}, io::{BorrowedBuf, Read, Seek, SeekFrom, Write}, path::Path, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }, + sync::{atomic::Ordering, Arc}, time::Duration, }; @@ -358,6 +355,9 @@ pub(crate) async fn accept_missing( } // Now receive other files. loop { + fail_point!("receiving_snapshot_net_error", |_| { + Err(box_err!("failed to receive snapshot")) + }); let chunk = match stream.next().await { Some(Ok(mut req)) if req.has_chunk() => req.take_chunk(), Some(Ok(req)) if req.has_end() => { @@ -707,8 +707,6 @@ pub struct TabletRunner { raft_router: R, cfg_tracker: Tracker, cfg: Config, - sending_count: Arc, - recving_count: Arc, cache_builder: B, limiter: Limiter, } @@ -746,8 +744,6 @@ impl TabletRunner { security_mgr, cfg_tracker, cfg: config, - sending_count: Arc::new(AtomicUsize::new(0)), - recving_count: Arc::new(AtomicUsize::new(0)), cache_builder, limiter, }; @@ -792,7 +788,8 @@ where self.pool.spawn(sink.fail(status).map(|_| ())); } Task::RecvTablet { stream, sink } => { - let task_num = self.recving_count.load(Ordering::SeqCst); + let recving_count = self.snap_mgr.recving_count().clone(); + let task_num = recving_count.load(Ordering::SeqCst); if task_num >= self.cfg.concurrent_recv_snap_limit { warn!("too many recving snapshot tasks, ignore"); let status = RpcStatus::with_message( @@ -809,7 +806,6 @@ where let snap_mgr = self.snap_mgr.clone(); let raft_router = self.raft_router.clone(); - let recving_count = self.recving_count.clone(); recving_count.fetch_add(1, 
Ordering::SeqCst); let limiter = self.limiter.clone(); let cache_builder = self.cache_builder.clone(); @@ -833,8 +829,8 @@ where } Task::Send { addr, msg, cb } => { let region_id = msg.get_region_id(); - if self.sending_count.load(Ordering::SeqCst) >= self.cfg.concurrent_send_snap_limit - { + let sending_count = self.snap_mgr.sending_count().clone(); + if sending_count.load(Ordering::SeqCst) >= self.cfg.concurrent_send_snap_limit { let key = TabletSnapKey::from_region_snap( msg.get_region_id(), msg.get_to_peer().get_id(), @@ -853,7 +849,6 @@ where let env = Arc::clone(&self.env); let mgr = self.snap_mgr.clone(); let security_mgr = Arc::clone(&self.security_mgr); - let sending_count = Arc::clone(&self.sending_count); sending_count.fetch_add(1, Ordering::SeqCst); let limiter = self.limiter.clone(); let send_task = send_snap( diff --git a/src/storage/errors.rs b/src/storage/errors.rs index 92568d22e45..07ea4b5589e 100644 --- a/src/storage/errors.rs +++ b/src/storage/errors.rs @@ -424,6 +424,13 @@ pub fn extract_key_error(err: &Error) -> kvrpcpb::KeyError { assertion_failed.set_existing_commit_ts(existing_commit_ts.into_inner()); key_error.set_assertion_failed(assertion_failed); } + Error(box ErrorInner::Txn(TxnError(box TxnErrorInner::Mvcc(MvccError( + box MvccErrorInner::PrimaryMismatch(lock_info), + ))))) => { + let mut primary_mismatch = kvrpcpb::PrimaryMismatch::default(); + primary_mismatch.set_lock_info(lock_info.clone()); + key_error.set_primary_mismatch(primary_mismatch); + } _ => { error!(?*err; "txn aborts"); key_error.set_abort(format!("{:?}", err)); diff --git a/src/storage/mod.rs b/src/storage/mod.rs index 37263ce9a12..897968ef671 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -7910,6 +7910,7 @@ mod tests { false, false, false, + true, Context::default(), ), expect_fail_callback(tx.clone(), 0, |e| match e { @@ -7936,6 +7937,7 @@ mod tests { true, false, false, + true, Context::default(), ), expect_value_callback(tx.clone(), 0, LockNotExist), 
@@ -7993,6 +7995,7 @@ mod tests { true, false, false, + true, Context::default(), ), expect_value_callback( @@ -8038,6 +8041,7 @@ mod tests { true, false, false, + true, Context::default(), ), expect_value_callback(tx.clone(), 0, committed(ts(20, 0))), @@ -8049,7 +8053,7 @@ mod tests { .sched_txn_command( commands::Prewrite::with_lock_ttl( vec![Mutation::make_put(k.clone(), v)], - k.as_encoded().to_vec(), + k.to_raw().unwrap(), ts(25, 0), 100, ), @@ -8069,6 +8073,7 @@ mod tests { true, false, false, + true, Context::default(), ), expect_value_callback(tx.clone(), 0, TtlExpire), @@ -9411,6 +9416,7 @@ mod tests { false, false, false, + true, Context::default(), ), expect_value_callback( @@ -9447,6 +9453,7 @@ mod tests { false, false, false, + true, Context::default(), ), expect_value_callback(tx.clone(), 0, TxnStatus::TtlExpire), @@ -9840,6 +9847,7 @@ mod tests { true, false, false, + true, Default::default(), ), expect_ok_callback(tx.clone(), 0), diff --git a/src/storage/mvcc/consistency_check.rs b/src/storage/mvcc/consistency_check.rs index 487ae61d5e8..311447601f8 100644 --- a/src/storage/mvcc/consistency_check.rs +++ b/src/storage/mvcc/consistency_check.rs @@ -175,7 +175,7 @@ impl MvccInfoScanner { }) } - fn next_item(&mut self) -> Result> { + pub fn next_item(&mut self) -> Result> { let mut lock_ok = box_try!(self.lock_iter.valid()); let mut writes_ok = box_try!(self.write_iter.valid()); @@ -221,7 +221,7 @@ impl MvccInfoScanner { } #[derive(Clone, Default)] -struct MvccInfoCollector { +pub struct MvccInfoCollector { current_item: Vec, mvcc_info: MvccInfo, } diff --git a/src/storage/mvcc/mod.rs b/src/storage/mvcc/mod.rs index 0f133b99941..1779c116ccd 100644 --- a/src/storage/mvcc/mod.rs +++ b/src/storage/mvcc/mod.rs @@ -20,7 +20,9 @@ pub use txn_types::{ }; pub use self::{ - consistency_check::{Mvcc as MvccConsistencyCheckObserver, MvccInfoIterator}, + consistency_check::{ + Mvcc as MvccConsistencyCheckObserver, MvccInfoCollector, MvccInfoIterator, 
MvccInfoScanner, + }, metrics::{GC_DELETE_VERSIONS_HISTOGRAM, MVCC_VERSIONS_HISTOGRAM}, reader::*, txn::{GcInfo, MvccTxn, ReleasedLock, MAX_TXN_WRITE_SIZE}, @@ -169,6 +171,9 @@ pub enum ErrorInner { )] LockIfExistsFailed { start_ts: TimeStamp, key: Vec }, + #[error("check_txn_status sent to secondary lock, current lock: {0:?}")] + PrimaryMismatch(kvproto::kvrpcpb::LockInfo), + #[error("{0:?}")] Other(#[from] Box), } @@ -298,6 +303,7 @@ impl ErrorInner { key: key.clone(), }) } + ErrorInner::PrimaryMismatch(l) => Some(ErrorInner::PrimaryMismatch(l.clone())), ErrorInner::Io(_) | ErrorInner::Other(_) => None, } } @@ -400,6 +406,7 @@ impl ErrorCodeExt for Error { ErrorInner::CommitTsTooLarge { .. } => error_code::storage::COMMIT_TS_TOO_LARGE, ErrorInner::AssertionFailed { .. } => error_code::storage::ASSERTION_FAILED, ErrorInner::LockIfExistsFailed { .. } => error_code::storage::LOCK_IF_EXISTS_FAILED, + ErrorInner::PrimaryMismatch(_) => error_code::storage::PRIMARY_MISMATCH, ErrorInner::Other(_) => error_code::storage::UNKNOWN, } } diff --git a/src/storage/txn/actions/check_txn_status.rs b/src/storage/txn/actions/check_txn_status.rs index a3cd3253201..b0e1ff66232 100644 --- a/src/storage/txn/actions/check_txn_status.rs +++ b/src/storage/txn/actions/check_txn_status.rs @@ -24,7 +24,15 @@ pub fn check_txn_status_lock_exists( caller_start_ts: TimeStamp, force_sync_commit: bool, resolving_pessimistic_lock: bool, + verify_is_primary: bool, ) -> Result<(TxnStatus, Option)> { + if verify_is_primary && !primary_key.is_encoded_from(&lock.primary) { + // Return the current lock info to tell the client what the actual primary is. + return Err( + ErrorInner::PrimaryMismatch(lock.into_lock_info(primary_key.into_raw()?)).into(), + ); + } + // Never rollback or push forward min_commit_ts in check_txn_status if it's // using async commit. Rollback of async-commit locks are done during // ResolveLock. 
diff --git a/src/storage/txn/commands/check_txn_status.rs b/src/storage/txn/commands/check_txn_status.rs index 895c753b160..e915c0357d4 100644 --- a/src/storage/txn/commands/check_txn_status.rs +++ b/src/storage/txn/commands/check_txn_status.rs @@ -51,6 +51,11 @@ command! { // lock, the transaction status could not be decided if the primary lock is pessimistic too and // it's still uncertain. resolving_pessimistic_lock: bool, + // Whether it's needed to check wheter the lock on the key (if any) is the primary lock. + // This is for handling some corner cases when pessimistic transactions changes its primary + // (see https://github.com/pingcap/tidb/issues/42937 for details). + // Must be set to true, unless the client is old version that doesn't support this behavior. + verify_is_primary: bool, } } @@ -107,6 +112,7 @@ impl WriteCommand for CheckTxnStatus { self.caller_start_ts, self.force_sync_commit, self.resolving_pessimistic_lock, + self.verify_is_primary, )?, l => ( check_txn_status_missing_lock( @@ -145,7 +151,7 @@ impl WriteCommand for CheckTxnStatus { #[cfg(test)] pub mod tests { use concurrency_manager::ConcurrencyManager; - use kvproto::kvrpcpb::{Context, PrewriteRequestPessimisticAction::*}; + use kvproto::kvrpcpb::{self, Context, LockInfo, PrewriteRequestPessimisticAction::*}; use tikv_util::deadline::Deadline; use txn_types::{Key, WriteType}; @@ -153,8 +159,10 @@ pub mod tests { use crate::storage::{ kv::Engine, lock_manager::MockLockManager, + mvcc, mvcc::tests::*, txn::{ + self, commands::{pessimistic_rollback, WriteCommand, WriteContext}, scheduler::DEFAULT_EXECUTION_DURATION_LIMIT, tests::*, @@ -188,6 +196,7 @@ pub mod tests { rollback_if_not_exist, force_sync_commit, resolving_pessimistic_lock, + verify_is_primary: true, deadline: Deadline::from_now(DEFAULT_EXECUTION_DURATION_LIMIT), }; let result = command @@ -220,7 +229,7 @@ pub mod tests { rollback_if_not_exist: bool, force_sync_commit: bool, resolving_pessimistic_lock: bool, - ) { + ) -> 
txn::Error { let ctx = Context::default(); let snapshot = engine.snapshot(Default::default()).unwrap(); let current_ts = current_ts.into(); @@ -235,23 +244,28 @@ pub mod tests { rollback_if_not_exist, force_sync_commit, resolving_pessimistic_lock, + verify_is_primary: true, deadline: Deadline::from_now(DEFAULT_EXECUTION_DURATION_LIMIT), }; - assert!( - command - .process_write( - snapshot, - WriteContext { - lock_mgr: &MockLockManager::new(), - concurrency_manager: cm, - extra_op: Default::default(), - statistics: &mut Default::default(), - async_apply_prewrite: false, - raw_ext: None, - }, + command + .process_write( + snapshot, + WriteContext { + lock_mgr: &MockLockManager::new(), + concurrency_manager: cm, + extra_op: Default::default(), + statistics: &mut Default::default(), + async_apply_prewrite: false, + raw_ext: None, + }, + ) + .map(|r| { + panic!( + "expected check_txn_status fail but succeeded with result: {:?}", + r.pr ) - .is_err() - ); + }) + .unwrap_err() } fn committed(commit_ts: impl Into) -> impl FnOnce(TxnStatus) -> bool { @@ -1188,4 +1202,46 @@ pub mod tests { assert!(rollback.last_change_ts.is_zero()); assert_eq!(rollback.versions_to_last_change, 0); } + + #[test] + fn test_verify_is_primary() { + let mut engine = TestEngineBuilder::new().build().unwrap(); + + let check_lock = |l: LockInfo, key: &'_ [u8], primary: &'_ [u8], lock_type| { + assert_eq!(&l.key, key); + assert_eq!(l.lock_type, lock_type); + assert_eq!(&l.primary_lock, primary); + }; + + let check_error = |e, key: &'_ [u8], primary: &'_ [u8], lock_type| match e { + txn::Error(box txn::ErrorInner::Mvcc(mvcc::Error( + box mvcc::ErrorInner::PrimaryMismatch(lock_info), + ))) => { + check_lock(lock_info, key, primary, lock_type); + } + e => panic!("unexpected error: {:?}", e), + }; + + must_acquire_pessimistic_lock(&mut engine, b"k1", b"k2", 1, 1); + let e = must_err(&mut engine, b"k1", 1, 1, 0, true, false, true); + check_error(e, b"k1", b"k2", kvrpcpb::Op::PessimisticLock); + let lock = 
must_pessimistic_locked(&mut engine, b"k1", 1, 1); + check_lock( + lock.into_lock_info(b"k1".to_vec()), + b"k1", + b"k2", + kvrpcpb::Op::PessimisticLock, + ); + + must_pessimistic_prewrite_put(&mut engine, b"k1", b"v1", b"k2", 1, 1, DoPessimisticCheck); + let e = must_err(&mut engine, b"k1", 1, 1, 0, true, false, true); + check_error(e, b"k1", b"k2", kvrpcpb::Op::Put); + let lock = must_locked(&mut engine, b"k1", 1); + check_lock( + lock.into_lock_info(b"k1".to_vec()), + b"k1", + b"k2", + kvrpcpb::Op::Put, + ); + } } diff --git a/src/storage/txn/commands/mod.rs b/src/storage/txn/commands/mod.rs index 4c01629ef48..5e484d385f2 100644 --- a/src/storage/txn/commands/mod.rs +++ b/src/storage/txn/commands/mod.rs @@ -306,6 +306,7 @@ impl From for TypedCommand { req.get_rollback_if_not_exist(), req.get_force_sync_commit(), req.get_resolving_pessimistic_lock(), + req.get_verify_is_primary(), req.take_context(), ) } diff --git a/tests/failpoints/cases/test_cmd_epoch_checker.rs b/tests/failpoints/cases/test_cmd_epoch_checker.rs index feaa1af76ef..73bc741d9bb 100644 --- a/tests/failpoints/cases/test_cmd_epoch_checker.rs +++ b/tests/failpoints/cases/test_cmd_epoch_checker.rs @@ -10,23 +10,26 @@ use kvproto::raft_cmdpb::{RaftCmdRequest, RaftCmdResponse}; use raft::eraftpb::MessageType; use raftstore::store::msg::*; use test_raftstore::*; -use tikv_util::HandyRwLock; +use tikv_util::{mpsc::future, HandyRwLock}; struct CbReceivers { proposed: mpsc::Receiver<()>, committed: mpsc::Receiver<()>, - applied: mpsc::Receiver, + applied: future::Receiver, } impl CbReceivers { - fn assert_not_ready(&self) { + fn assert_not_ready(&mut self) { sleep_ms(100); assert_eq!(self.proposed.try_recv().unwrap_err(), TryRecvError::Empty); assert_eq!(self.committed.try_recv().unwrap_err(), TryRecvError::Empty); - assert_eq!(self.applied.try_recv().unwrap_err(), TryRecvError::Empty); + assert_eq!( + self.applied.try_recv().unwrap_err(), + crossbeam::channel::TryRecvError::Empty + ); } - fn 
assert_ok(&self) { + fn assert_ok(&mut self) { self.assert_applied_ok(); // proposed and committed should be invoked before applied self.proposed.try_recv().unwrap(); @@ -34,14 +37,14 @@ impl CbReceivers { } // When fails to propose, only applied callback will be invoked. - fn assert_err(&self) { + fn assert_err(&mut self) { let resp = self.applied.recv_timeout(Duration::from_secs(1)).unwrap(); assert!(resp.get_header().has_error(), "{:?}", resp); self.proposed.try_recv().unwrap_err(); self.committed.try_recv().unwrap_err(); } - fn assert_applied_ok(&self) { + fn assert_applied_ok(&mut self) { let resp = self.applied.recv_timeout(Duration::from_secs(1)).unwrap(); assert!( !resp.get_header().has_error(), @@ -119,7 +122,7 @@ fn test_reject_proposal_during_region_split() { fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"k1"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -141,13 +144,13 @@ fn test_reject_proposal_during_region_split() { ); // The write request fails due to epoch not match. - for r in receivers { + for mut r in receivers { r.assert_err(); } // New write request can succeed. let write_req = make_write_req(&mut cluster, b"k1"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -198,7 +201,7 @@ fn test_reject_proposal_during_region_merge() { fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"a"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -222,7 +225,7 @@ fn test_reject_proposal_during_region_merge() { .has_error() ); // The write request fails due to epoch not match. 
- for r in receivers { + for mut r in receivers { r.assert_err(); } @@ -234,7 +237,7 @@ fn test_reject_proposal_during_region_merge() { fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"a"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -251,7 +254,7 @@ fn test_reject_proposal_during_region_merge() { fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"k"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -266,13 +269,13 @@ fn test_reject_proposal_during_region_merge() { fail::remove(commit_merge_fp); pd_client.check_merged_timeout(source.get_id(), Duration::from_secs(5)); // The write request fails due to epoch not match. - for r in receivers { + for mut r in receivers { r.assert_err(); } // New write request can succeed. let write_req = make_write_req(&mut cluster, b"k"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -321,7 +324,7 @@ fn test_reject_proposal_during_rollback_region_merge() { fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"a"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -336,7 +339,7 @@ fn test_reject_proposal_during_rollback_region_merge() { // New write request can succeed. 
let write_req = make_write_req(&mut cluster, b"a"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -374,7 +377,7 @@ fn test_reject_proposal_during_leader_transfer() { fail::cfg(force_delay_propose_batch_raft_command_fp, "2*return").unwrap(); } let write_req = make_write_req(&mut cluster, b"k"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -395,14 +398,14 @@ fn test_accept_proposal_during_conf_change() { let conf_change_fp = "apply_on_conf_change_all_1"; fail::cfg(conf_change_fp, "pause").unwrap(); - let add_peer_rx = cluster.async_add_peer(r, new_peer(2, 2)).unwrap(); + let mut add_peer_rx = cluster.async_add_peer(r, new_peer(2, 2)).unwrap(); add_peer_rx .recv_timeout(Duration::from_millis(100)) .unwrap_err(); // Conf change doesn't affect proposals. let write_req = make_write_req(&mut cluster, b"k"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -445,7 +448,7 @@ fn test_not_invoke_committed_cb_when_fail_to_commit() { // proposal. 
cluster.partition(vec![1], vec![2, 3]); let write_req = make_write_req(&mut cluster, b"k1"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -489,7 +492,7 @@ fn test_propose_before_transfer_leader() { fail::cfg(force_delay_propose_batch_raft_command_fp, "return").unwrap(); let write_req = make_write_req(&mut cluster, b"k1"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -518,7 +521,7 @@ fn test_propose_before_split_and_merge() { fail::cfg(force_delay_propose_batch_raft_command_fp, "return").unwrap(); let write_req = make_write_req(&mut cluster, b"k1"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -542,7 +545,7 @@ fn test_propose_before_split_and_merge() { cluster.must_transfer_leader(right.get_id(), right_peer2); let write_req = make_write_req(&mut cluster, b"k0"); - let (cb, cb_receivers) = make_cb(&write_req); + let (cb, mut cb_receivers) = make_cb(&write_req); cluster .sim .rl() @@ -552,7 +555,7 @@ fn test_propose_before_split_and_merge() { cb_receivers.assert_proposed_ok(); let write_req2 = make_write_req(&mut cluster, b"k2"); - let (cb2, cb_receivers2) = make_cb(&write_req2); + let (cb2, mut cb_receivers2) = make_cb(&write_req2); cluster .sim .rl() diff --git a/tests/failpoints/cases/test_conf_change.rs b/tests/failpoints/cases/test_conf_change.rs index 0a1be37cab6..c3612e64127 100644 --- a/tests/failpoints/cases/test_conf_change.rs +++ b/tests/failpoints/cases/test_conf_change.rs @@ -110,7 +110,7 @@ fn test_write_after_destroy() { let mut epoch = cluster.pd_client.get_region_epoch(r1); let mut admin_req = new_admin_request(r1, &epoch, conf_change); admin_req.mut_header().set_peer(new_peer(1, 1)); - let (cb1, rx1) = make_cb(&admin_req); + let (cb1, mut rx1) = make_cb(&admin_req); let engines_3 = cluster.get_all_engines(3); let region = 
block_on(cluster.pd_client.get_region_by_id(r1)) .unwrap() diff --git a/tests/failpoints/cases/test_coprocessor.rs b/tests/failpoints/cases/test_coprocessor.rs index d7f6540a3c6..d397d602d84 100644 --- a/tests/failpoints/cases/test_coprocessor.rs +++ b/tests/failpoints/cases/test_coprocessor.rs @@ -11,7 +11,7 @@ use kvproto::{ use more_asserts::{assert_ge, assert_le}; use protobuf::Message; use test_coprocessor::*; -use test_raftstore::{must_get_equal, new_peer, new_server_cluster}; +use test_raftstore_macro::test_case; use test_storage::*; use tidb_query_datatype::{ codec::{datum, Datum}, @@ -361,14 +361,14 @@ fn test_paging_scan_multi_ranges() { } } -#[test] +// TODO: #[test_case(test_raftstore_v2::must_new_cluster_and_kv_client_mul)] +#[test_case(test_raftstore::must_new_cluster_and_kv_client_mul)] fn test_read_index_lock_checking_on_follower() { - let mut cluster = new_server_cluster(0, 2); - + let (mut cluster, _client, _ctx) = new_cluster(2); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); - let rid = cluster.run_conf_change(); + let rid = 1; cluster.must_put(b"k1", b"v1"); pd_client.must_add_peer(rid, new_peer(2, 2)); must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); diff --git a/tests/failpoints/cases/test_disk_full.rs b/tests/failpoints/cases/test_disk_full.rs index f1b135ef86a..fc28560c7f1 100644 --- a/tests/failpoints/cases/test_disk_full.rs +++ b/tests/failpoints/cases/test_disk_full.rs @@ -11,7 +11,7 @@ use kvproto::{ use raft::eraftpb::MessageType; use raftstore::store::msg::*; use test_raftstore::*; -use tikv_util::{config::ReadableDuration, time::Instant}; +use tikv_util::{config::ReadableDuration, future::block_on_timeout, time::Instant}; fn assert_disk_full(resp: &RaftCmdResponse) { assert!(resp.get_header().get_error().has_disk_full()); @@ -67,7 +67,7 @@ fn ensure_disk_usage_is_reported( let peer = new_peer(store_id, peer_id); let key = region.get_start_key(); let ch = async_read_on_peer(cluster, peer, 
region.clone(), key, true, true); - ch.recv_timeout(Duration::from_secs(1)).unwrap(); + block_on_timeout(ch, Duration::from_secs(1)).unwrap(); } fn test_disk_full_leader_behaviors(usage: DiskUsage) { @@ -86,7 +86,7 @@ fn test_disk_full_leader_behaviors(usage: DiskUsage) { // Test new normal proposals won't be allowed when disk is full. let old_last_index = cluster.raft_local_state(1, 1).last_index; - let rx = cluster.async_put(b"k2", b"v2").unwrap(); + let mut rx = cluster.async_put(b"k2", b"v2").unwrap(); assert_disk_full(&rx.recv_timeout(Duration::from_secs(2)).unwrap()); let new_last_index = cluster.raft_local_state(1, 1).last_index; assert_eq!(old_last_index, new_last_index); @@ -299,7 +299,7 @@ fn test_majority_disk_full() { } // Normal proposals will be rejected because of majority peers' disk full. - let ch = cluster.async_put(b"k2", b"v2").unwrap(); + let mut ch = cluster.async_put(b"k2", b"v2").unwrap(); let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![2, 3]); @@ -310,7 +310,7 @@ fn test_majority_disk_full() { let put = new_request(1, epoch.clone(), reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); assert!(!resp.get_header().has_error()); @@ -335,7 +335,7 @@ fn test_majority_disk_full() { let put = new_request(1, epoch.clone(), reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); let resp = ch.recv_timeout(Duration::from_secs(10)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![2, 3]); @@ -354,7 +354,7 @@ fn test_majority_disk_full() { let put = new_request(1, 
epoch, reqs, false); let mut opts = RaftCmdExtraOpts::default(); opts.disk_full_opt = DiskFullOpt::AllowedOnAlmostFull; - let ch = cluster.async_request_with_opts(put, opts).unwrap(); + let mut ch = cluster.async_request_with_opts(put, opts).unwrap(); let resp = ch.recv_timeout(Duration::from_secs(1)).unwrap(); assert_eq!(disk_full_stores(&resp), vec![3]); diff --git a/tests/failpoints/cases/test_hibernate.rs b/tests/failpoints/cases/test_hibernate.rs index 6bbed4ac641..d8670d9a21f 100644 --- a/tests/failpoints/cases/test_hibernate.rs +++ b/tests/failpoints/cases/test_hibernate.rs @@ -23,7 +23,7 @@ fn test_break_leadership_on_restart() { // stable. cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10; cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11; - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.pd_client.disable_default_operator(); let r = cluster.run_conf_change(); cluster.pd_client.must_add_peer(r, new_peer(2, 2)); @@ -81,3 +81,54 @@ fn test_break_leadership_on_restart() { // incorrectly. rx.recv_timeout(Duration::from_secs(2)).unwrap_err(); } + +// This case creates a cluster with 3 TiKV instances, and then wait all peers +// hibernate. +// +// After that, propose a command and stop the leader node immediately. +// With failpoint `receive_raft_message_from_outside`, we can make the proposal +// reach 2 followers *after* `StoreUnreachable` is broadcasted. +// +// 2 followers may become GroupState::Chaos after `StoreUnreachable` is +// received, and become `GroupState::Ordered` after the proposal is received. +// But they should keep wakeful for a while. 
+#[test] +fn test_store_disconnect_with_hibernate() { + let mut cluster = new_server_cluster(0, 3); + let base_tick_ms = 50; + cluster.cfg.raft_store.raft_base_tick_interval = ReadableDuration::millis(base_tick_ms); + cluster.cfg.raft_store.raft_heartbeat_ticks = 2; + cluster.cfg.raft_store.raft_election_timeout_ticks = 10; + cluster.cfg.raft_store.unreachable_backoff = ReadableDuration::millis(500); + cluster.cfg.server.raft_client_max_backoff = ReadableDuration::millis(200); + // So the random election timeout will always be 10, which makes the case more + // stable. + cluster.cfg.raft_store.raft_min_election_timeout_ticks = 10; + cluster.cfg.raft_store.raft_max_election_timeout_ticks = 11; + configure_for_hibernate(&mut cluster.cfg); + cluster.pd_client.disable_default_operator(); + let r = cluster.run_conf_change(); + cluster.pd_client.must_add_peer(r, new_peer(2, 2)); + cluster.pd_client.must_add_peer(r, new_peer(3, 3)); + + cluster.must_put(b"k1", b"v1"); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + must_get_equal(&cluster.get_engine(3), b"k1", b"v1"); + + // Wait until all peers of region 1 hibernate. + thread::sleep(Duration::from_millis(base_tick_ms * 30)); + + // Stop the region leader. + fail::cfg("receive_raft_message_from_outside", "pause").unwrap(); + let _ = cluster.async_put(b"k2", b"v2").unwrap(); + cluster.stop_node(1); + + // Wait for a while so that the failpoint can be triggered on followers. + thread::sleep(Duration::from_millis(100)); + fail::remove("receive_raft_message_from_outside"); + + // Wait for a while. Peers of region 1 shouldn't hibernate. 
+ thread::sleep(Duration::from_millis(base_tick_ms * 30)); + must_get_equal(&cluster.get_engine(2), b"k2", b"v2"); + must_get_equal(&cluster.get_engine(3), b"k2", b"v2"); +} diff --git a/tests/failpoints/cases/test_import_service.rs b/tests/failpoints/cases/test_import_service.rs index 3fdb464c718..475acbe9f3c 100644 --- a/tests/failpoints/cases/test_import_service.rs +++ b/tests/failpoints/cases/test_import_service.rs @@ -19,6 +19,7 @@ use tikv_util::HandyRwLock; mod util; use self::util::{ check_ingested_kvs, new_cluster_and_tikv_import_client, new_cluster_and_tikv_import_client_tde, + open_cluster_and_tikv_import_client_v2, send_upload_sst, }; // Opening sst writer involves IO operation, it may block threads for a while. @@ -248,3 +249,47 @@ fn test_ingest_file_twice_and_conflict() { resp.get_error().get_message() ); } + +#[test] +fn test_ingest_sst_v2() { + let mut cluster = test_raftstore_v2::new_server_cluster(1, 1); + let (ctx, _tikv, import) = open_cluster_and_tikv_import_client_v2(None, &mut cluster); + let temp_dir = Builder::new().prefix("test_ingest_sst").tempdir().unwrap(); + let sst_path = temp_dir.path().join("test.sst"); + let sst_range = (0, 100); + let (mut meta, data) = gen_sst_file(sst_path, sst_range); + + // No region id and epoch. 
+ send_upload_sst(&import, &meta, &data).unwrap(); + let mut ingest = IngestRequest::default(); + ingest.set_context(ctx.clone()); + ingest.set_sst(meta.clone()); + meta.set_region_id(ctx.get_region_id()); + meta.set_region_epoch(ctx.get_region_epoch().clone()); + send_upload_sst(&import, &meta, &data).unwrap(); + ingest.set_sst(meta); + let resp = import.ingest(&ingest).unwrap(); + assert!(!resp.has_error(), "{:?}", resp.get_error()); + fail::cfg("on_cleanup_import_sst", "return").unwrap(); + let (tx, rx) = channel::<()>(); + let tx = Arc::new(Mutex::new(tx)); + fail::cfg_callback("on_cleanup_import_sst_schedule", move || { + tx.lock().unwrap().send(()).unwrap(); + }) + .unwrap(); + + rx.recv_timeout(std::time::Duration::from_secs(20)).unwrap(); + let mut count = 0; + for path in &cluster.paths { + let sst_dir = path.path().join("import-sst"); + for entry in std::fs::read_dir(sst_dir).unwrap() { + let entry = entry.unwrap(); + if entry.file_type().unwrap().is_file() { + count += 1; + } + } + } + fail::remove("on_cleanup_import_sst"); + fail::remove("on_cleanup_import_sst_schedule"); + assert_ne!(0, count); +} diff --git a/tests/failpoints/cases/test_merge.rs b/tests/failpoints/cases/test_merge.rs index 1a733be5d8c..16796cfa555 100644 --- a/tests/failpoints/cases/test_merge.rs +++ b/tests/failpoints/cases/test_merge.rs @@ -1530,7 +1530,7 @@ fn test_retry_pending_prepare_merge_fail() { let (propose_tx, propose_rx) = mpsc::sync_channel(10); fail::cfg_callback("after_propose", move || propose_tx.send(()).unwrap()).unwrap(); - let rx = cluster.async_put(b"k1", b"v11").unwrap(); + let mut rx = cluster.async_put(b"k1", b"v11").unwrap(); propose_rx.recv_timeout(Duration::from_secs(2)).unwrap(); rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); diff --git a/tests/failpoints/cases/test_replica_read.rs b/tests/failpoints/cases/test_replica_read.rs index 9f844f582e4..64f363f0ced 100644 --- a/tests/failpoints/cases/test_replica_read.rs +++ 
b/tests/failpoints/cases/test_replica_read.rs @@ -7,17 +7,21 @@ use std::{ }; use crossbeam::channel; -use engine_traits::{Peekable, RaftEngineReadOnly, CF_RAFT}; +use engine_traits::RaftEngineReadOnly; use futures::executor::block_on; use kvproto::raft_serverpb::{PeerState, RaftMessage, RegionLocalState}; use raft::eraftpb::MessageType; -use test_raftstore::*; -use tikv_util::{config::ReadableDuration, HandyRwLock}; +use test_raftstore::{Simulator as S1, *}; +use test_raftstore_macro::test_case; +use test_raftstore_v2::Simulator as S2; +use tikv::storage::config::EngineType; +use tikv_util::{config::ReadableDuration, future::block_on_timeout, HandyRwLock}; use txn_types::{Key, Lock, LockType}; -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_wait_for_apply_index() { - let mut cluster = new_server_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // Increase the election tick to make this test case running reliably. configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); @@ -52,18 +56,13 @@ fn test_wait_for_apply_index() { ); request.mut_header().set_peer(p3); request.mut_header().set_replica_read(true); - let (cb, rx) = make_cb(&request); - cluster - .sim - .rl() - .async_command_on_node(3, request, cb) - .unwrap(); + let mut rx = async_command_on_node(&mut cluster, 3, request); // Must timeout here - rx.recv_timeout(Duration::from_millis(500)).unwrap_err(); + block_on_timeout(rx.as_mut(), Duration::from_millis(500)).unwrap_err(); fail::remove("on_apply_write_cmd"); // After write cmd applied, the follower read will be executed. 
- match rx.recv_timeout(Duration::from_secs(3)) { + match block_on_timeout(rx.as_mut(), Duration::from_secs(3)) { Ok(resp) => { assert_eq!(resp.get_responses().len(), 1); assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v1"); @@ -72,10 +71,11 @@ fn test_wait_for_apply_index() { } } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_duplicate_read_index_ctx() { // Initialize cluster - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); cluster.cfg.raft_store.raft_heartbeat_ticks = 1; let pd_client = Arc::clone(&cluster.pd_client); @@ -118,43 +118,40 @@ fn test_duplicate_read_index_ctx() { true, ); request.mut_header().set_peer(p2); - let (cb2, rx2) = make_cb(&request); + // In v2, we use replica read to force issue a read index. + if cluster.cfg.storage.engine == EngineType::RaftKv2 { + request.mut_requests()[0] = new_get_cmd(b"k0"); + request.mut_header().set_replica_read(true); + } // send to peer 2 - cluster - .sim - .rl() - .async_command_on_node(2, request.clone(), cb2) - .unwrap(); + let mut rx2 = async_command_on_node(&mut cluster, 2, request.clone()); rx.recv_timeout(Duration::from_secs(5)).unwrap(); must_get_equal(&cluster.get_engine(3), b"k0", b"v0"); request.mut_header().set_peer(p3); - let (cb3, rx3) = make_cb(&request); // send to peer 3 - cluster - .sim - .rl() - .async_command_on_node(3, request, cb3) - .unwrap(); + let mut rx3 = async_command_on_node(&mut cluster, 3, request); rx.recv_timeout(Duration::from_secs(5)).unwrap(); let router = cluster.sim.wl().get_router(1).unwrap(); fail::cfg("pause_on_peer_collect_message", "pause").unwrap(); cluster.sim.wl().clear_recv_filters(1); for raft_msg in std::mem::take(&mut *dropped_msgs.lock().unwrap()) { - router.send_raft_message(raft_msg).unwrap(); + #[allow(clippy::useless_conversion)] + 
router.send_raft_message(raft_msg.into()).unwrap(); } fail::remove("pause_on_peer_collect_message"); // read index response must not be dropped - rx2.recv_timeout(Duration::from_secs(5)).unwrap(); - rx3.recv_timeout(Duration::from_secs(5)).unwrap(); + block_on_timeout(rx2.as_mut(), Duration::from_secs(5)).unwrap(); + block_on_timeout(rx3.as_mut(), Duration::from_secs(5)).unwrap(); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_read_before_init() { // Initialize cluster - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -184,13 +181,8 @@ fn test_read_before_init() { ); request.mut_header().set_peer(p3); request.mut_header().set_replica_read(true); - let (cb, rx) = make_cb(&request); - cluster - .sim - .rl() - .async_command_on_node(3, request, cb) - .unwrap(); - let resp = rx.recv_timeout(Duration::from_secs(5)).unwrap(); + let mut rx = async_command_on_node(&mut cluster, 3, request); + let resp = block_on_timeout(rx.as_mut(), Duration::from_secs(5)).unwrap(); fail::remove("before_handle_snapshot_ready_3"); assert!( resp.get_header() @@ -202,10 +194,11 @@ fn test_read_before_init() { ); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_read_applying_snapshot() { // Initialize cluster - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -224,14 +217,11 @@ fn test_read_applying_snapshot() { cluster.pd_client.must_add_peer(r1, p3.clone()); thread::sleep(Duration::from_millis(500)); - // Check if peer 3 is applying snapshot - let region_key = 
keys::region_state_key(r1); - let region_state: RegionLocalState = cluster - .get_engine(3) - .get_msg_cf(CF_RAFT, ®ion_key) - .unwrap() - .unwrap(); - assert_eq!(region_state.get_state(), PeerState::Applying); + // Check if peer 3 is applying snapshot for raftstore v1. + if cluster.cfg.storage.engine == EngineType::RaftKv { + let region_state: RegionLocalState = cluster.region_local_state(r1, 3); + assert_eq!(region_state.get_state(), PeerState::Applying); + } let region = cluster.get_region(b"k0"); assert_eq!(cluster.leader_of_region(r1).unwrap(), p1); @@ -243,28 +233,26 @@ fn test_read_applying_snapshot() { ); request.mut_header().set_peer(p3); request.mut_header().set_replica_read(true); - let (cb, rx) = make_cb(&request); - cluster - .sim - .rl() - .async_command_on_node(3, request, cb) - .unwrap(); - let resp = match rx.recv_timeout(Duration::from_secs(5)) { - Ok(r) => r, + let mut rx = async_command_on_node(&mut cluster, 3, request); + match block_on_timeout(rx.as_mut(), Duration::from_secs(5)) { + Ok(resp) => { + // In raftstore v1, read fails due to snapshot. + assert!(cluster.cfg.storage.engine == EngineType::RaftKv); + assert!( + resp.get_header() + .get_error() + .get_message() + .contains("applying snapshot"), + "{:?}", + resp.get_header().get_error() + ); + } Err(_) => { - fail::remove("region_apply_snap"); - panic!("cannot receive response"); + // In raftstore v2, snapshot blocks reads. 
+ assert!(cluster.cfg.storage.engine == EngineType::RaftKv2); } }; fail::remove("region_apply_snap"); - assert!( - resp.get_header() - .get_error() - .get_message() - .contains("applying snapshot"), - "{:?}", - resp.get_header().get_error() - ); } #[test] @@ -328,7 +316,7 @@ fn test_read_after_cleanup_range_for_snap() { request.mut_header().set_peer(p3); request.mut_header().set_replica_read(true); // Send follower read request to peer 3 - let (cb1, rx1) = make_cb(&request); + let (cb1, mut rx1) = make_cb(&request); cluster .sim .rl() @@ -362,9 +350,10 @@ fn test_read_after_cleanup_range_for_snap() { /// slowly and drops the no-op entry from the new leader, and it had to wait for /// a heartbeat timeout to know its leader before that it can't handle any read /// request. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] fn test_new_split_learner_can_not_find_leader() { - let mut cluster = new_node_cluster(0, 4); + let mut cluster = new_cluster(0, 4); configure_for_lease_read(&mut cluster.cfg, Some(5000), None); let pd_client = Arc::clone(&cluster.pd_client); @@ -400,16 +389,17 @@ fn test_new_split_learner_can_not_find_leader() { let new_region = cluster.get_region(b"k2"); let learner_peer = find_peer(&new_region, 3).unwrap().clone(); let resp_ch = async_read_on_peer(&mut cluster, learner_peer, new_region, b"k2", true, true); - let resp = resp_ch.recv_timeout(Duration::from_secs(3)).unwrap(); + let resp = block_on_timeout(resp_ch, Duration::from_secs(3)).unwrap(); let exp_value = resp.get_responses()[0].get_get().get_value(); assert_eq!(exp_value, b"v2"); } /// Test if the read index request can get a correct response when the commit /// index of leader if not up-to-date after transferring leader. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_replica_read_after_transfer_leader() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_lease_read(&mut cluster.cfg, Some(50), Some(100)); @@ -466,7 +456,8 @@ fn test_replica_read_after_transfer_leader() { let router = cluster.sim.wl().get_router(2).unwrap(); for raft_msg in std::mem::take(&mut *dropped_msgs.lock().unwrap()) { - router.send_raft_message(raft_msg).unwrap(); + #[allow(clippy::useless_conversion)] + router.send_raft_message(raft_msg.into()).unwrap(); } let new_region = cluster.get_region(b"k1"); @@ -476,16 +467,17 @@ fn test_replica_read_after_transfer_leader() { fail::remove(on_peer_collect_message_2); - let resp = resp_ch.recv_timeout(Duration::from_secs(3)).unwrap(); + let resp = block_on_timeout(resp_ch, Duration::from_secs(3)).unwrap(); let exp_value = resp.get_responses()[0].get_get().get_value(); assert_eq!(exp_value, b"v2"); } // This test is for reproducing the bug that some replica reads was sent to a // leader and shared a same read index because of the optimization on leader. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] fn test_read_index_after_transfer_leader() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); configure_for_lease_read(&mut cluster.cfg, Some(50), Some(100)); @@ -544,10 +536,12 @@ fn test_read_index_after_transfer_leader() { // Send heartbeat and append responses to advance read index. 
let router = cluster.sim.wl().get_router(2).unwrap(); for msg in append_msgs { - router.send_raft_message(msg.clone()).unwrap(); + #[allow(clippy::useless_conversion)] + router.send_raft_message(msg.clone().into()).unwrap(); } for msg in heartbeat_msgs { - router.send_raft_message(msg.clone()).unwrap(); + #[allow(clippy::useless_conversion)] + router.send_raft_message(msg.clone().into()).unwrap(); } fail::remove(on_peer_collect_message_2); // Wait for read index has been advanced. @@ -563,11 +557,12 @@ fn test_read_index_after_transfer_leader() { ) }); for msg in vote_msgs { - router.send_raft_message(msg.clone()).unwrap(); + #[allow(clippy::useless_conversion)] + router.send_raft_message(msg.clone().into()).unwrap(); } - for resp in responses { - resp.recv_timeout(Duration::from_millis(200)).unwrap(); + for mut resp in responses { + block_on_timeout(resp.as_mut(), Duration::from_millis(200)).unwrap(); } cluster.sim.wl().clear_recv_filters(2); @@ -576,7 +571,8 @@ fn test_read_index_after_transfer_leader() { /// Test if the read index request can get a correct response when the commit /// index of leader if not up-to-date after transferring leader. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +// #[test_case(test_raftstore_v2::new_node_cluster)] fn test_batch_read_index_after_transfer_leader() { let mut cluster = new_node_cluster(0, 3); configure_for_lease_read(&mut cluster.cfg, Some(50), Some(100)); @@ -615,7 +611,8 @@ let router = cluster.sim.wl().get_router(2).unwrap(); for raft_msg in std::mem::take(&mut *dropped_msgs.lock().unwrap()) { - router.send_raft_message(raft_msg).unwrap(); + #[allow(clippy::useless_conversion)] + router.send_raft_message(raft_msg.into()).unwrap(); } let mut resps = Vec::with_capacity(2); @@ -633,7 +630,7 @@ let resps = resps .into_iter() - .map(|x| x.recv_timeout(Duration::from_secs(5)).unwrap()) + .map(|mut x| x.recv_timeout(Duration::from_secs(5)).unwrap()) .collect::<Vec<_>>(); // `term` in the header is `current_term`, not term of the entry at @@ -653,9 +650,13 @@ } } -#[test] +// Read index on follower must also return KeyIsLocked error. +// +// Note: this test case is not applicable to raftstore v2, because it no +// longer supports read index from users. 
+#[test_case(test_raftstore::new_node_cluster)] fn test_read_index_lock_checking_on_follower() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); let pd_client = Arc::clone(&cluster.pd_client); pd_client.disable_default_operator(); @@ -675,7 +676,7 @@ fn test_read_index_lock_checking_on_follower() { fail::cfg("before_propose_readindex", "1*pause").unwrap(); let mut resp = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1.clone(), b"k1", true); for i in 0..=20 { - let res = resp.recv_timeout(Duration::from_millis(500)); + let res = block_on_timeout(resp.as_mut(), Duration::from_millis(500)); if res.is_err() { break; } @@ -717,7 +718,7 @@ fn test_read_index_lock_checking_on_follower() { // We must make sure the lock check is done on peer 3. fail::remove("before_propose_readindex"); - let resp = resp.recv_timeout(Duration::from_millis(2000)).unwrap(); + let resp = block_on_timeout(resp.as_mut(), Duration::from_millis(2000)).unwrap(); assert_eq!( &lock.into_lock_info(b"k1".to_vec()), resp.get_responses()[0].get_read_index().get_locked(), @@ -726,9 +727,10 @@ fn test_read_index_lock_checking_on_follower() { ); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_read_index_lock_checking_on_false_leader() { - let mut cluster = new_node_cluster(0, 5); + let mut cluster = new_cluster(0, 5); // Use long election timeout and short lease. configure_for_lease_read(&mut cluster.cfg, Some(50), Some(200)); cluster.cfg.raft_store.raft_store_max_leader_lease = @@ -793,8 +795,8 @@ fn test_read_index_lock_checking_on_false_leader() { // Read index from peer 2, the read index message will be sent to the old leader // peer 1. But the lease of peer 1 has expired and it cannot get majority of // heartbeat. So, we cannot get the result here. 
- let resp = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1, b"k1", true); - resp.recv_timeout(Duration::from_millis(300)).unwrap_err(); + let mut resp = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1, b"k1", true); + block_on_timeout(resp.as_mut(), Duration::from_millis(300)).unwrap_err(); // Now, restore the network partition. Peer 1 should now become follower and // drop its pending read index request. Peer 2 cannot get the result now. @@ -805,10 +807,10 @@ fn test_read_index_lock_checking_on_false_leader() { ); cluster.sim.wl().add_recv_filter(2, recv_filter); cluster.clear_send_filters(); - resp.recv_timeout(Duration::from_millis(300)).unwrap_err(); + block_on_timeout(resp.as_mut(), Duration::from_millis(300)).unwrap_err(); // After cleaning all filters, peer 2 will retry and will get error. cluster.sim.wl().clear_recv_filters(2); - let resp = resp.recv_timeout(Duration::from_millis(2000)).unwrap(); + let resp = block_on_timeout(resp.as_mut(), Duration::from_secs(2)).unwrap(); assert!(resp.get_header().has_error()); } diff --git a/tests/failpoints/cases/test_snap.rs b/tests/failpoints/cases/test_snap.rs index 4ca18dcd716..a090ba8530c 100644 --- a/tests/failpoints/cases/test_snap.rs +++ b/tests/failpoints/cases/test_snap.rs @@ -641,33 +641,31 @@ fn test_snapshot_gc_after_failed() { cluster.sim.wl().clear_recv_filters(3); } -#[test] +#[test_case(test_raftstore::new_server_cluster)] +#[test_case(test_raftstore_v2::new_server_cluster)] fn test_sending_fail_with_net_error() { - let mut cluster = new_server_cluster(1, 2); + let mut cluster = new_cluster(1, 2); configure_for_snapshot(&mut cluster.cfg); cluster.cfg.raft_store.snap_gc_timeout = ReadableDuration::millis(300); - let pd_client = Arc::clone(&cluster.pd_client); - // Disable default max peer count check. + let pd_client = cluster.pd_client.clone(); + // Disable default max peer number check. 
pd_client.disable_default_operator(); let r1 = cluster.run_conf_change(); cluster.must_put(b"k1", b"v1"); let (send_tx, send_rx) = mpsc::sync_channel(1); // only send one MessageType::MsgSnapshot message - cluster.sim.wl().add_send_filter( - 1, - Box::new( - RegionPacketFilter::new(r1, 1) - .allow(1) - .direction(Direction::Send) - .msg_type(MessageType::MsgSnapshot) - .set_msg_callback(Arc::new(move |m: &RaftMessage| { - if m.get_message().get_msg_type() == MessageType::MsgSnapshot { - let _ = send_tx.send(()); - } - })), - ), - ); + cluster.add_send_filter(CloneFilterFactory( + RegionPacketFilter::new(r1, 1) + .allow(1) + .direction(Direction::Send) + .msg_type(MessageType::MsgSnapshot) + .set_msg_callback(Arc::new(move |m: &RaftMessage| { + if m.get_message().get_msg_type() == MessageType::MsgSnapshot { + let _ = send_tx.send(()); + } + })), + )); // peer2 will interrupt in receiving snapshot fail::cfg("receiving_snapshot_net_error", "return()").unwrap(); @@ -678,8 +676,8 @@ // need to wait receiver handle the snapshot request sleep_ms(100); - // peer2 will not become learner so ti will has k1 key and receiving count will - // zero + // peer2 can't receive any snapshot, so it doesn't have any key values. + // But the receiving_count should be zero if receiving the snapshot fails. 
let engine2 = cluster.get_engine(2); must_get_none(&engine2, b"k1"); assert_eq!(cluster.get_snap_mgr(2).stats().receiving_count, 0); diff --git a/tests/failpoints/cases/test_stale_read.rs b/tests/failpoints/cases/test_stale_read.rs index 475ed71a1b0..523bb54f7cb 100644 --- a/tests/failpoints/cases/test_stale_read.rs +++ b/tests/failpoints/cases/test_stale_read.rs @@ -455,7 +455,7 @@ fn test_read_after_peer_destroyed() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() diff --git a/tests/failpoints/cases/test_transaction.rs b/tests/failpoints/cases/test_transaction.rs index 564b5f393ec..a0e69108125 100644 --- a/tests/failpoints/cases/test_transaction.rs +++ b/tests/failpoints/cases/test_transaction.rs @@ -13,8 +13,8 @@ use futures::executor::block_on; use grpcio::{ChannelBuilder, Environment}; use kvproto::{ kvrpcpb::{ - self as pb, AssertionLevel, Context, Op, PessimisticLockRequest, PrewriteRequest, - PrewriteRequestPessimisticAction::*, + self as pb, AssertionLevel, Context, GetRequest, Op, PessimisticLockRequest, + PrewriteRequest, PrewriteRequestPessimisticAction::*, }, tikvpb::TikvClient, }; @@ -26,7 +26,7 @@ use storage::{ }, txn::{self, commands}, }; -use test_raftstore::new_server_cluster; +use test_raftstore::{configure_for_lease_read, new_server_cluster}; use tikv::storage::{ self, kv::SnapshotExt, @@ -609,3 +609,72 @@ fn test_concurrent_write_after_transfer_leader_invalidates_locks() { &lock.into_lock().into_lock_info(b"key".to_vec()) ); } + +#[test] +fn test_read_index_with_max_ts() { + let mut cluster = new_server_cluster(0, 3); + // Increase the election tick to make this test case run reliably. + // Use async apply prewrite to let tikv respond before applying on the leader + // peer. 
+ configure_for_lease_read(&mut cluster.cfg, Some(50), Some(10_000)); + cluster.cfg.storage.enable_async_apply_prewrite = true; + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + + let k0 = b"k0"; + let v0 = b"v0"; + let r1 = cluster.run_conf_change(); + let p2 = new_peer(2, 2); + cluster.pd_client.must_add_peer(r1, p2.clone()); + let p3 = new_peer(3, 3); + cluster.pd_client.must_add_peer(r1, p3.clone()); + cluster.must_put(k0, v0); + cluster.pd_client.must_none_pending_peer(p2.clone()); + cluster.pd_client.must_none_pending_peer(p3.clone()); + + let region = cluster.get_region(k0); + cluster.must_transfer_leader(region.get_id(), p3.clone()); + + // Block all write cmd applying of Peer 3(leader), then start to write to it. + let k1 = b"k1"; + let v1 = b"v1"; + let mut ctx_p3 = Context::default(); + ctx_p3.set_region_id(region.get_id()); + ctx_p3.set_region_epoch(region.get_region_epoch().clone()); + ctx_p3.set_peer(p3.clone()); + let mut ctx_p2 = ctx_p3.clone(); + ctx_p2.set_peer(p2.clone()); + + let start_ts = 10; + let mut mutation = pb::Mutation::default(); + mutation.set_op(Op::Put); + mutation.key = k1.to_vec(); + mutation.value = v1.to_vec(); + let mut req = PrewriteRequest::default(); + req.set_context(ctx_p3); + req.set_mutations(vec![mutation].into()); + req.set_start_version(start_ts); + req.try_one_pc = true; + req.set_primary_lock(k1.to_vec()); + + let env = Arc::new(Environment::new(1)); + let channel = + ChannelBuilder::new(env.clone()).connect(&cluster.sim.rl().get_addr(p3.get_store_id())); + let client_p3 = TikvClient::new(channel); + fail::cfg("on_apply_write_cmd", "sleep(2000)").unwrap(); + client_p3.kv_prewrite(&req).unwrap(); + + // The apply is blocked on leader, so the read index request with max ts should + // see the memory lock as it would be dropped after finishing apply. 
+ let channel = ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(p2.get_store_id())); + let client_p2 = TikvClient::new(channel); + let mut req = GetRequest::new(); + req.key = k1.to_vec(); + req.version = u64::MAX; + ctx_p2.replica_read = true; + req.set_context(ctx_p2); + let resp = client_p2.kv_get(&req).unwrap(); + assert!(resp.region_error.is_none()); + assert_eq!(resp.error.unwrap().locked.unwrap().lock_version, start_ts); + fail::remove("on_apply_write_cmd"); +} diff --git a/tests/integrations/config/mod.rs b/tests/integrations/config/mod.rs index 39d3b9da57f..9f46202f785 100644 --- a/tests/integrations/config/mod.rs +++ b/tests/integrations/config/mod.rs @@ -194,9 +194,11 @@ fn test_serde_custom_tikv_config() { region_split_check_diff: Some(ReadableSize::mb(20)), region_compact_check_interval: ReadableDuration::secs(12), clean_stale_peer_delay: ReadableDuration::secs(0), - region_compact_check_step: 1_234, + region_compact_check_step: Some(1_234), region_compact_min_tombstones: 999, region_compact_tombstones_percent: 33, + region_compact_min_redundant_rows: 999, + region_compact_redundant_rows_percent: 33, pd_heartbeat_tick_interval: ReadableDuration::minutes(12), pd_store_heartbeat_tick_interval: ReadableDuration::secs(12), notify_capacity: 12_345, diff --git a/tests/integrations/config/test-custom.toml b/tests/integrations/config/test-custom.toml index f0c3dac0c04..7ba70b0a6f6 100644 --- a/tests/integrations/config/test-custom.toml +++ b/tests/integrations/config/test-custom.toml @@ -167,6 +167,8 @@ clean-stale-peer-delay = "0s" region-compact-check-step = 1234 region-compact-min-tombstones = 999 region-compact-tombstones-percent = 33 +region-compact-min-redundant-rows = 999 +region-compact-redundant-rows-percent = 33 pd-heartbeat-tick-interval = "12m" pd-store-heartbeat-tick-interval = "12s" snap-mgr-gc-tick-interval = "12m" diff --git a/tests/integrations/import/util.rs b/tests/integrations/import/util.rs index e6e2121a479..cb1e0e336be 100644 
--- a/tests/integrations/import/util.rs +++ b/tests/integrations/import/util.rs @@ -2,6 +2,7 @@ use std::{sync::Arc, thread, time::Duration}; +use engine_rocks::RocksEngine; use futures::{executor::block_on, stream, SinkExt}; use grpcio::{ChannelBuilder, Environment, Result, WriteFlags}; use kvproto::{import_sstpb::*, kvrpcpb::*, tikvpb::*}; @@ -68,6 +69,57 @@ pub fn open_cluster_and_tikv_import_client( (cluster, ctx, tikv, import) } +#[allow(dead_code)] +pub fn open_cluster_and_tikv_import_client_v2( + cfg: Option<TikvConfig>, + cluster: &mut test_raftstore_v2::Cluster< + test_raftstore_v2::ServerCluster, + RocksEngine, + >, +) -> (Context, TikvClient, ImportSstClient) { + let cfg = cfg.unwrap_or_else(|| { + let mut config = TikvConfig::default(); + config.server.addr = "127.0.0.1:0".to_owned(); + let cleanup_interval = Duration::from_millis(10); + config.raft_store.cleanup_import_sst_interval.0 = cleanup_interval; + config.server.grpc_concurrency = 1; + config + }); + cluster.cfg = Config { + tikv: cfg.clone(), + prefer_mem: true, + }; + cluster.run(); + + let region_id = 1; + let leader = cluster.leader_of_region(region_id).unwrap(); + let epoch = cluster.get_region_epoch(region_id); + let mut ctx = Context::default(); + ctx.set_region_id(region_id); + ctx.set_peer(leader); + ctx.set_region_epoch(epoch); + + let ch = { + let env = Arc::new(Environment::new(1)); + let node = ctx.get_peer().get_store_id(); + let builder = ChannelBuilder::new(env) + .http2_max_ping_strikes(i32::MAX) // For pings without data from clients. 
+ .keepalive_time(cluster.cfg.server.grpc_keepalive_time.into()) + .keepalive_timeout(cluster.cfg.server.grpc_keepalive_timeout.into()); + + if cfg.security != SecurityConfig::default() { + let creds = test_util::new_channel_cred(); + builder.secure_connect(&cluster.sim.rl().get_addr(node), creds) + } else { + builder.connect(&cluster.sim.rl().get_addr(node)) + } + }; + let tikv = TikvClient::new(ch.clone()); + let import = ImportSstClient::new(ch); + + (ctx, tikv, import) +} + pub fn new_cluster_and_tikv_import_client() -> (Cluster, Context, TikvClient, ImportSstClient) { open_cluster_and_tikv_import_client(None) diff --git a/tests/integrations/raftstore/test_compact_after_delete.rs b/tests/integrations/raftstore/test_compact_after_delete.rs index 13cfb535e97..6ba405bb918 100644 --- a/tests/integrations/raftstore/test_compact_after_delete.rs +++ b/tests/integrations/raftstore/test_compact_after_delete.rs @@ -5,8 +5,9 @@ use std::{ time::Duration, }; +use collections::HashMap; use engine_rocks::{raw::Range, util::get_cf_handle}; -use engine_traits::{MiscExt, CF_WRITE}; +use engine_traits::{CachedTablet, MiscExt, CF_WRITE}; use keys::{data_key, DATA_MAX_KEY}; use test_raftstore::*; use tikv::storage::mvcc::{TimeStamp, Write, WriteType}; @@ -35,7 +36,7 @@ fn test_compact_after_delete(cluster: &mut Cluster) { cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); cluster.cfg.raft_store.region_compact_min_tombstones = 500; cluster.cfg.raft_store.region_compact_tombstones_percent = 50; - cluster.cfg.raft_store.region_compact_check_step = 1; + cluster.cfg.raft_store.region_compact_check_step = Some(1); cluster.cfg.rocksdb.titan.enabled = true; cluster.run(); @@ -85,3 +86,153 @@ fn test_node_compact_after_delete() { let mut cluster = new_node_cluster(0, count); test_compact_after_delete(&mut cluster); } + +#[test] +fn test_node_compact_after_delete_v2() { + let count = 1; + let mut cluster = test_raftstore_v2::new_node_cluster(0, count); + + 
cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); + cluster.cfg.raft_store.region_compact_min_tombstones = 50; + cluster.cfg.raft_store.region_compact_tombstones_percent = 50; + // disable it + cluster.cfg.raft_store.region_compact_min_redundant_rows = 10000000; + cluster.cfg.raft_store.region_compact_check_step = Some(2); + cluster.cfg.rocksdb.titan.enabled = true; + cluster.run(); + + let region = cluster.get_region(b""); + let (split_key, _) = gen_mvcc_put_kv(b"k100", b"", 1.into(), 2.into()); + cluster.must_split(®ion, &split_key); + + for i in 0..200 { + let (k, v) = (format!("k{:03}", i), format!("value{}", i)); + let (k, v) = gen_mvcc_put_kv(k.as_bytes(), v.as_bytes(), 1.into(), 2.into()); + cluster.must_put_cf(CF_WRITE, &k, &v); + } + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|_, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + db.flush_cf(CF_WRITE, true).unwrap(); + } + true + }) + } + + let (sender, receiver) = mpsc::channel(); + let sync_sender = Mutex::new(sender); + fail::cfg_callback("raftstore-v2::CheckAndCompact::AfterCompact", move || { + let sender = sync_sender.lock().unwrap(); + sender.send(true).unwrap(); + }) + .unwrap(); + for i in 0..200 { + let k = format!("k{:03}", i); + let k = gen_delete_k(k.as_bytes(), 2.into()); + cluster.must_delete_cf(CF_WRITE, &k); + } + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|_, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + db.flush_cf(CF_WRITE, true).unwrap(); + } + true + }) + } + + // wait for 2 regions' compaction. 
+ receiver.recv_timeout(Duration::from_millis(5000)).unwrap(); + receiver.recv_timeout(Duration::from_millis(5000)).unwrap(); + + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|_, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + let cf_handle = get_cf_handle(db.as_inner(), CF_WRITE).unwrap(); + let approximate_size = db + .as_inner() + .get_approximate_sizes_cf(cf_handle, &[Range::new(b"", DATA_MAX_KEY)])[0]; + assert_eq!(approximate_size, 0); + } + true + }) + } +} + +#[test] +fn test_node_compact_after_update_v2() { + let count = 1; + let mut cluster = test_raftstore_v2::new_node_cluster(0, count); + + cluster.cfg.raft_store.region_compact_check_interval = ReadableDuration::millis(100); + // disable it + cluster.cfg.raft_store.region_compact_min_tombstones = 1000000; + cluster.cfg.raft_store.region_compact_redundant_rows_percent = 40; + cluster.cfg.raft_store.region_compact_min_redundant_rows = 50; + cluster.cfg.raft_store.region_compact_check_step = Some(2); + cluster.cfg.rocksdb.titan.enabled = true; + cluster.run(); + + let region = cluster.get_region(b""); + let (split_key, _) = gen_mvcc_put_kv(b"k100", b"", 1.into(), 2.into()); + cluster.must_split(®ion, &split_key); + + for i in 0..200 { + let (k, v) = (format!("k{:03}", i), format!("value{}", i)); + let (k, v) = gen_mvcc_put_kv(k.as_bytes(), v.as_bytes(), 1.into(), 2.into()); + cluster.must_put_cf(CF_WRITE, &k, &v); + + let (k, v) = (format!("k{:03}", i), format!("value{}", i)); + let (k, v) = gen_mvcc_put_kv(k.as_bytes(), v.as_bytes(), 3.into(), 4.into()); + cluster.must_put_cf(CF_WRITE, &k, &v); + } + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|_, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + db.flush_cf(CF_WRITE, true).unwrap(); + } + true + }) + } + + fail::cfg("on_collect_regions_to_compact", "pause").unwrap(); + let mut db_size_before_compact = HashMap::default(); + for (registry, _) in &cluster.engines { + 
registry.for_each_opened_tablet(|id, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + let cf_handle = get_cf_handle(db.as_inner(), CF_WRITE).unwrap(); + let approximate_size = db + .as_inner() + .get_approximate_sizes_cf(cf_handle, &[Range::new(b"", DATA_MAX_KEY)])[0]; + db_size_before_compact.insert(id, approximate_size); + } + true + }) + } + fail::remove("on_collect_regions_to_compact"); + + let (sender, receiver) = mpsc::channel(); + let sync_sender = Mutex::new(sender); + fail::cfg_callback("raftstore-v2::CheckAndCompact::AfterCompact", move || { + let sender = sync_sender.lock().unwrap(); + sender.send(true).unwrap(); + }) + .unwrap(); + + // wait for 2 regions' compaction. + receiver.recv_timeout(Duration::from_millis(5000)).unwrap(); + receiver.recv_timeout(Duration::from_millis(5000)).unwrap(); + + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|id, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + let cf_handle = get_cf_handle(db.as_inner(), CF_WRITE).unwrap(); + let approximate_size = db + .as_inner() + .get_approximate_sizes_cf(cf_handle, &[Range::new(b"", DATA_MAX_KEY)])[0]; + let size_before = db_size_before_compact.get(&id).unwrap(); + assert!(approximate_size < *size_before); + } + true + }) + } +} diff --git a/tests/integrations/raftstore/test_flashback.rs b/tests/integrations/raftstore/test_flashback.rs index eec5ea9b94c..0b703cf32dd 100644 --- a/tests/integrations/raftstore/test_flashback.rs +++ b/tests/integrations/raftstore/test_flashback.rs @@ -5,40 +5,58 @@ use std::{ time::{Duration, Instant}, }; -use futures::{channel::oneshot, executor::block_on}; +use engine_rocks::RocksEngine; +use futures::executor::block_on; use kvproto::{ errorpb::FlashbackInProgress, metapb, - raft_cmdpb::{AdminCmdType, CmdType, RaftCmdResponse, Request}, + raft_cmdpb::{AdminCmdType, CmdType, RaftCmdRequest, RaftCmdResponse, Request}, + raft_serverpb::RegionLocalState, }; use raftstore::store::Callback; use 
test_raftstore::*; +use test_raftstore_macro::test_case; use txn_types::WriteBatchFlags; const TEST_KEY: &[u8] = b"k1"; const TEST_VALUE: &[u8] = b"v1"; -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_allow_read_only_request() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); + configure_for_lease_read(&mut cluster.cfg, Some(50), Some(30)); cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); + cluster.must_put(TEST_KEY, TEST_VALUE); let mut region = cluster.get_region(TEST_KEY); - let mut snap_req = Request::default(); - snap_req.set_cmd_type(CmdType::Snap); - // Get snapshot normally. - let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), false); - assert!(!snap_resp.get_header().has_error()); - // Get snapshot with flashback flag without in the flashback state. - let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), true); - assert!(!snap_resp.get_header().has_error()); - // Get snapshot with flashback flag with in the flashback state. + let mut get_req = Request::default(); + get_req.set_cmd_type(CmdType::Get); + // Get normally. + let snap_resp = request(&mut cluster, &mut region.clone(), get_req.clone(), false); + assert!( + !snap_resp.get_header().has_error(), + "{:?}", + snap_resp.get_header() + ); + // Get with flashback flag without in the flashback state. + let snap_resp = request(&mut cluster, &mut region.clone(), get_req.clone(), true); + assert!( + !snap_resp.get_header().has_error(), + "{:?}", + snap_resp.get_header() + ); + // Get with flashback flag with in the flashback state. cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::PrepareFlashback); - let snap_resp = request(&mut cluster, &mut region.clone(), snap_req.clone(), true); - assert!(!snap_resp.get_header().has_error()); - // Get snapshot without flashback flag with in the flashback state. 
- let snap_resp = request(&mut cluster, &mut region, snap_req, false); + let snap_resp = request(&mut cluster, &mut region.clone(), get_req.clone(), true); + assert!( + !snap_resp.get_header().has_error(), + "{:?}", + snap_resp.get_header() + ); + // Get without flashback flag with in the flashback state. + let snap_resp = request(&mut cluster, &mut region, get_req, false); assert!( snap_resp .get_header() @@ -51,10 +69,11 @@ fn test_allow_read_only_request() { cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); } -#[test] #[cfg(feature = "failpoints")] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_read_after_prepare_flashback() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -73,7 +92,9 @@ fn test_read_after_prepare_flashback() { cluster.must_send_wait_flashback_msg(region.get_id(), AdminCmdType::FinishFlashback); } -#[test] +#[cfg(feature = "failpoints")] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_prepare_flashback_after_split() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -98,23 +119,13 @@ fn test_prepare_flashback_after_split() { // Make sure the admin split cmd is ready. sleep(Duration::from_millis(100)); // Send the prepare flashback msg. - let (result_tx, result_rx) = oneshot::channel(); - cluster.must_send_flashback_msg( - old_region.get_id(), - AdminCmdType::PrepareFlashback, - Callback::write(Box::new(move |resp| { - if resp.response.get_header().has_error() { - result_tx - .send(Some(resp.response.get_header().get_error().clone())) - .unwrap(); - return; - } - result_tx.send(None).unwrap(); - })), - ); + let resp = cluster.must_send_flashback_msg(old_region.get_id(), AdminCmdType::PrepareFlashback); // Remove the pause to make these two commands are in the same batch to apply. 
fail::remove(on_handle_apply_fp); - let prepare_flashback_err = block_on(result_rx).unwrap().unwrap(); + let prepare_flashback_err = block_on(async { + let resp = resp.await; + resp.get_header().get_error().clone() + }); assert!( prepare_flashback_err.has_epoch_not_match(), "prepare flashback should fail with epoch not match, but got {:?}", @@ -133,7 +144,9 @@ fn test_prepare_flashback_after_split() { must_check_flashback_state(&mut cluster, right_region.get_id(), 1, false); } -#[test] +// #[cfg(feature = "failpoints")] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_prepare_flashback_after_conf_change() { let mut cluster = new_node_cluster(0, 3); // Disable default max peer count check. @@ -150,23 +163,13 @@ fn test_prepare_flashback_after_conf_change() { // Make sure the conf change cmd is ready. sleep(Duration::from_millis(100)); // Send the prepare flashback msg. - let (result_tx, result_rx) = oneshot::channel(); - cluster.must_send_flashback_msg( - region_id, - AdminCmdType::PrepareFlashback, - Callback::write(Box::new(move |resp| { - if resp.response.get_header().has_error() { - result_tx - .send(Some(resp.response.get_header().get_error().clone())) - .unwrap(); - return; - } - result_tx.send(None).unwrap(); - })), - ); + let resp = cluster.must_send_flashback_msg(region_id, AdminCmdType::PrepareFlashback); // Remove the pause to make these two commands are in the same batch to apply. 
fail::remove(on_handle_apply_fp); - let prepare_flashback_err = block_on(result_rx).unwrap().unwrap(); + let prepare_flashback_err = block_on(async { + let resp = resp.await; + resp.get_header().get_error().clone() + }); assert!( prepare_flashback_err.has_epoch_not_match(), "prepare flashback should fail with epoch not match, but got {:?}", @@ -179,7 +182,8 @@ fn test_prepare_flashback_after_conf_change() { must_check_flashback_state(&mut cluster, region_id, 1, false); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_flashback_unprepared() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -193,9 +197,10 @@ fn test_flashback_unprepared() { ); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_flashback_for_schedule() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); cluster.run(); cluster.must_transfer_leader(1, new_peer(2, 2)); cluster.must_transfer_leader(1, new_peer(1, 1)); @@ -224,7 +229,8 @@ fn test_flashback_for_schedule() { cluster.must_transfer_leader(1, new_peer(2, 2)); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_flashback_for_write() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -259,7 +265,8 @@ fn test_flashback_for_write() { ); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_flashback_for_read() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -341,7 +348,8 @@ fn test_flashback_for_local_read() { must_request_with_flashback_flag(&mut cluster, &mut region, new_get_cmd(TEST_KEY)); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_flashback_for_status_cmd_as_region_detail() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -365,7 
+373,8 @@ fn test_flashback_for_status_cmd_as_region_detail() { assert_eq!(region_detail.get_leader(), &leader); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_flashback_for_check_is_in_persist() { let mut cluster = new_node_cluster(0, 3); cluster.run(); @@ -381,7 +390,8 @@ fn test_flashback_for_check_is_in_persist() { must_check_flashback_state(&mut cluster, 1, 2, false); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_flashback_for_apply_snapshot() { let mut cluster = new_node_cluster(0, 3); configure_for_snapshot(&mut cluster.cfg); @@ -450,8 +460,67 @@ fn test_flashback_for_apply_snapshot() { ); } -fn must_check_flashback_state( - cluster: &mut Cluster, +trait ClusterI { + fn region_local_state(&self, region_id: u64, store_id: u64) -> RegionLocalState; + fn query_leader( + &self, + store_id: u64, + region_id: u64, + timeout: Duration, + ) -> Option; + fn call_command( + &self, + request: RaftCmdRequest, + timeout: Duration, + ) -> raftstore::Result; +} + +impl ClusterI for Cluster { + fn region_local_state(&self, region_id: u64, store_id: u64) -> RegionLocalState { + Cluster::::region_local_state(self, region_id, store_id) + } + fn query_leader( + &self, + store_id: u64, + region_id: u64, + timeout: Duration, + ) -> Option { + Cluster::::query_leader(self, store_id, region_id, timeout) + } + fn call_command( + &self, + request: RaftCmdRequest, + timeout: Duration, + ) -> raftstore::Result { + Cluster::::call_command(self, request, timeout) + } +} + +type ClusterV2 = + test_raftstore_v2::Cluster, RocksEngine>; +impl ClusterI for ClusterV2 { + fn region_local_state(&self, region_id: u64, store_id: u64) -> RegionLocalState { + ClusterV2::region_local_state(self, region_id, store_id) + } + fn query_leader( + &self, + store_id: u64, + region_id: u64, + timeout: Duration, + ) -> Option { + ClusterV2::query_leader(self, store_id, 
region_id, timeout) + } + fn call_command( + &self, + request: RaftCmdRequest, + timeout: Duration, + ) -> raftstore::Result { + ClusterV2::call_command(self, request, timeout) + } +} + +fn must_check_flashback_state( + cluster: &mut T, region_id: u64, store_id: u64, is_in_flashback: bool, @@ -473,8 +542,8 @@ fn must_check_flashback_state( ); } -fn request( - cluster: &mut Cluster, +fn request( + cluster: &mut T, region: &mut metapb::Region, req: Request, with_flashback_flag: bool, @@ -497,8 +566,8 @@ fn request( } // Make sure the request could be executed with flashback flag. -fn must_request_with_flashback_flag( - cluster: &mut Cluster, +fn must_request_with_flashback_flag( + cluster: &mut T, region: &mut metapb::Region, req: Request, ) { @@ -506,8 +575,8 @@ fn must_request_with_flashback_flag( assert!(!resp.get_header().has_error(), "{:?}", resp); } -fn must_get_flashback_not_prepared_error( - cluster: &mut Cluster, +fn must_get_flashback_not_prepared_error( + cluster: &mut T, region: &mut metapb::Region, req: Request, ) { @@ -516,8 +585,8 @@ fn must_get_flashback_not_prepared_error( } // Make sure the request could be executed without flashback flag. 
-fn must_request_without_flashback_flag( - cluster: &mut Cluster, +fn must_request_without_flashback_flag( + cluster: &mut T, region: &mut metapb::Region, req: Request, ) { @@ -525,8 +594,8 @@ fn must_request_without_flashback_flag( assert!(!resp.get_header().has_error(), "{:?}", resp); } -fn must_get_flashback_in_progress_error( - cluster: &mut Cluster, +fn must_get_flashback_in_progress_error( + cluster: &mut T, region: &mut metapb::Region, req: Request, ) { diff --git a/tests/integrations/raftstore/test_hibernate.rs b/tests/integrations/raftstore/test_hibernate.rs index 73156becb0d..86962330f0f 100644 --- a/tests/integrations/raftstore/test_hibernate.rs +++ b/tests/integrations/raftstore/test_hibernate.rs @@ -15,7 +15,7 @@ use tikv_util::{time::Instant, HandyRwLock}; #[test] fn test_proposal_prevent_sleep() { let mut cluster = new_node_cluster(0, 3); - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k1", b"v1"); @@ -62,7 +62,7 @@ fn test_proposal_prevent_sleep() { true, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); // send to peer 2 cluster .sim @@ -108,7 +108,7 @@ fn test_proposal_prevent_sleep() { #[test] fn test_single_voter_restart() { let mut cluster = new_server_cluster(0, 2); - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.pd_client.disable_default_operator(); cluster.run_conf_change(); cluster.pd_client.must_add_peer(1, new_learner_peer(2, 2)); @@ -127,7 +127,7 @@ fn test_single_voter_restart() { #[test] fn test_prompt_learner() { let mut cluster = new_server_cluster(0, 4); - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(20); cluster.pd_client.disable_default_operator(); cluster.run_conf_change(); @@ -169,7 +169,7 @@ fn 
test_prompt_learner() { #[test] fn test_transfer_leader_delay() { let mut cluster = new_node_cluster(0, 3); - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k1", b"v1"); @@ -237,7 +237,7 @@ fn test_transfer_leader_delay() { #[test] fn test_split_delay() { let mut cluster = new_server_cluster(0, 4); - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(20); cluster.pd_client.disable_default_operator(); cluster.run_conf_change(); @@ -277,7 +277,7 @@ fn test_split_delay() { #[test] fn test_inconsistent_configuration() { let mut cluster = new_node_cluster(0, 3); - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k1", b"v1"); @@ -362,7 +362,7 @@ fn test_inconsistent_configuration() { fn test_hibernate_feature_gate() { let mut cluster = new_node_cluster(0, 3); cluster.pd_client.reset_version("4.0.0"); - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.run(); cluster.must_transfer_leader(1, new_peer(1, 1)); cluster.must_put(b"k1", b"v1"); @@ -411,7 +411,7 @@ fn test_hibernate_feature_gate() { #[test] fn test_leader_demoted_when_hibernated() { let mut cluster = new_node_cluster(0, 4); - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); cluster.pd_client.disable_default_operator(); let r = cluster.run_conf_change(); cluster.pd_client.must_add_peer(r, new_peer(2, 2)); diff --git a/tests/integrations/raftstore/test_joint_consensus.rs b/tests/integrations/raftstore/test_joint_consensus.rs index 7845ecec43d..282d0d0525c 100644 --- a/tests/integrations/raftstore/test_joint_consensus.rs +++ b/tests/integrations/raftstore/test_joint_consensus.rs @@ -1,9 +1,6 @@ // Copyright 2020 TiKV 
Project Authors. Licensed under Apache-2.0. -use std::{ - sync::{mpsc, Arc}, - time::*, -}; +use std::{sync::Arc, time::*}; use kvproto::{ metapb::{self, PeerRole, Region}, @@ -13,7 +10,7 @@ use pd_client::PdClient; use raft::eraftpb::ConfChangeType; use raftstore::Result; use test_raftstore::*; -use tikv_util::store::find_peer; +use tikv_util::{mpsc::future, store::find_peer}; /// Tests multiple confchange commands can be done by one request #[test] @@ -167,23 +164,23 @@ fn test_request_in_joint_state() { // Isolated peer 2, so the old configuation can't reach quorum cluster.add_send_filter(IsolationFilterFactory::new(2)); - let rx = cluster + let mut rx = cluster .async_request(put_request(®ion, 1, b"k3", b"v3")) .unwrap(); assert_eq!( rx.recv_timeout(Duration::from_millis(100)), - Err(mpsc::RecvTimeoutError::Timeout) + Err(future::RecvTimeoutError::Timeout) ); cluster.clear_send_filters(); // Isolated peer 3, so the new configuation can't reach quorum cluster.add_send_filter(IsolationFilterFactory::new(3)); - let rx = cluster + let mut rx = cluster .async_request(put_request(®ion, 1, b"k4", b"v4")) .unwrap(); assert_eq!( rx.recv_timeout(Duration::from_millis(100)), - Err(mpsc::RecvTimeoutError::Timeout) + Err(future::RecvTimeoutError::Timeout) ); cluster.clear_send_filters(); diff --git a/tests/integrations/raftstore/test_lease_read.rs b/tests/integrations/raftstore/test_lease_read.rs index 8ac364faae9..e1905c99476 100644 --- a/tests/integrations/raftstore/test_lease_read.rs +++ b/tests/integrations/raftstore/test_lease_read.rs @@ -16,7 +16,7 @@ use pd_client::PdClient; use raft::eraftpb::{ConfChangeType, MessageType}; use raftstore::store::{Callback, RegionSnapshot}; use test_raftstore::*; -use tikv_util::{config::*, time::Instant, HandyRwLock}; +use tikv_util::{config::*, future::block_on_timeout, time::Instant, HandyRwLock}; // A helper function for testing the lease reads and lease renewing. 
// The leader keeps a record of its leader lease, and uses the system's @@ -430,7 +430,7 @@ fn test_node_callback_when_destroyed() { let get = new_get_cmd(b"k1"); let mut req = new_request(1, epoch, vec![get], true); req.mut_header().set_peer(leader); - let (cb, rx) = make_cb(&req); + let (cb, mut rx) = make_cb(&req); cluster .sim .rl() @@ -500,8 +500,8 @@ fn test_read_index_stale_in_suspect_lease() { cluster.must_put(b"k2", b"v2"); must_get_equal(&cluster.get_engine(3), b"k2", b"v2"); // Ensure peer 3 is ready to become leader. - let rx = async_read_on_peer(&mut cluster, new_peer(3, 3), r1.clone(), b"k2", true, true); - let resp = rx.recv_timeout(Duration::from_secs(3)).unwrap(); + let resp_ch = async_read_on_peer(&mut cluster, new_peer(3, 3), r1.clone(), b"k2", true, true); + let resp = block_on_timeout(resp_ch, Duration::from_secs(3)).unwrap(); assert!(!resp.get_header().has_error(), "{:?}", resp); assert_eq!( resp.get_responses()[0].get_get().get_value(), @@ -649,7 +649,7 @@ fn test_not_leader_read_lease() { true, ); req.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&req); + let (cb, mut rx) = make_cb(&req); cluster.sim.rl().async_command_on_node(1, req, cb).unwrap(); cluster.must_transfer_leader(region_id, new_peer(3, 3)); @@ -716,7 +716,7 @@ fn test_read_index_after_write() { ); req.mut_header() .set_peer(new_peer(1, region_on_store1.get_id())); - let (cb, rx) = make_cb(&req); + let (cb, mut rx) = make_cb(&req); cluster.sim.rl().async_command_on_node(1, req, cb).unwrap(); cluster.sim.wl().clear_recv_filters(2); diff --git a/tests/integrations/raftstore/test_merge.rs b/tests/integrations/raftstore/test_merge.rs index 404cb418d33..dda92230ec8 100644 --- a/tests/integrations/raftstore/test_merge.rs +++ b/tests/integrations/raftstore/test_merge.rs @@ -1445,10 +1445,10 @@ fn test_merge_pessimistic_locks_when_gap_is_too_large() { // The gap is too large, so the previous merge should fail. And this new put // request should be allowed. 
- let res = cluster.async_put(b"k1", b"new_val").unwrap(); + let mut res = cluster.async_put(b"k1", b"new_val").unwrap(); cluster.clear_send_filters(); - res.recv().unwrap(); + res.recv_timeout(Duration::from_secs(5)).unwrap(); assert_eq!(cluster.must_get(b"k1").unwrap(), b"new_val"); } diff --git a/tests/integrations/raftstore/test_replica_read.rs b/tests/integrations/raftstore/test_replica_read.rs index 0359bacf436..40189017645 100644 --- a/tests/integrations/raftstore/test_replica_read.rs +++ b/tests/integrations/raftstore/test_replica_read.rs @@ -12,13 +12,15 @@ use std::{ time::Duration, }; -use futures::{compat::Future01CompatExt, executor::block_on, FutureExt}; +use futures::executor::block_on; use kvproto::raft_serverpb::RaftMessage; use pd_client::PdClient; use raft::eraftpb::MessageType; use raftstore::{store::ReadIndexContext, Result}; -use test_raftstore::*; -use tikv_util::{config::*, time::Instant, timer::GLOBAL_TIMER_HANDLE, HandyRwLock}; +use test_raftstore::{Simulator as S1, *}; +use test_raftstore_macro::test_case; +use test_raftstore_v2::Simulator as S2; +use tikv_util::{config::*, future::block_on_timeout, time::Instant, HandyRwLock}; use txn_types::{Key, Lock, LockType}; use uuid::Uuid; @@ -53,9 +55,10 @@ impl Filter for CommitToFilter { } } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_replica_read_not_applied() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); // Increase the election tick to make this test case running reliably. configure_for_lease_read(&mut cluster.cfg, Some(50), Some(30)); @@ -100,27 +103,29 @@ fn test_replica_read_not_applied() { let r1 = cluster.get_region(b"k1"); // Read index on follower should be blocked instead of get an old value. 
- let resp1_ch = async_read_on_peer(&mut cluster, new_peer(3, 3), r1.clone(), b"k1", true, true); - resp1_ch.recv_timeout(Duration::from_secs(1)).unwrap_err(); + let mut resp1_ch = + async_read_on_peer(&mut cluster, new_peer(3, 3), r1.clone(), b"k1", true, true); + block_on_timeout(resp1_ch.as_mut(), Duration::from_secs(1)).unwrap_err(); // Unpark all append responses so that the new leader can commit its first // entry. let router = cluster.sim.wl().get_router(2).unwrap(); for raft_msg in mem::take::>(dropped_msgs.lock().unwrap().as_mut()) { - router.send_raft_message(raft_msg).unwrap(); + #[allow(clippy::useless_conversion)] + router.send_raft_message(raft_msg.into()).unwrap(); } // The old read index request won't be blocked forever as it's retried // internally. cluster.sim.wl().clear_send_filters(1); cluster.sim.wl().clear_recv_filters(2); - let resp1 = resp1_ch.recv_timeout(Duration::from_secs(6)).unwrap(); + let resp1 = block_on_timeout(resp1_ch, Duration::from_secs(6)).unwrap(); let exp_value = resp1.get_responses()[0].get_get().get_value(); assert_eq!(exp_value, b"v2"); // New read index requests can be resolved quickly. let resp2_ch = async_read_on_peer(&mut cluster, new_peer(3, 3), r1, b"k1", true, true); - let resp2 = resp2_ch.recv_timeout(Duration::from_secs(3)).unwrap(); + let resp2 = block_on_timeout(resp2_ch, Duration::from_secs(3)).unwrap(); let exp_value = resp2.get_responses()[0].get_get().get_value(); assert_eq!(exp_value, b"v2"); } @@ -150,8 +155,8 @@ fn test_replica_read_on_hibernate() { let r1 = cluster.get_region(b"k1"); // Read index on follower should be blocked. 
- let resp1_ch = async_read_on_peer(&mut cluster, new_peer(1, 1), r1, b"k1", true, true); - resp1_ch.recv_timeout(Duration::from_secs(1)).unwrap_err(); + let mut resp1_ch = async_read_on_peer(&mut cluster, new_peer(1, 1), r1, b"k1", true, true); + block_on_timeout(resp1_ch.as_mut(), Duration::from_secs(1)).unwrap_err(); let (tx, rx) = mpsc::sync_channel(1024); let cb = Arc::new(move |msg: &RaftMessage| { @@ -220,7 +225,7 @@ fn test_read_hibernated_region() { cluster.pd_client.trigger_leader_info_loss(); // This request will fail because no valid leader. let resp1_ch = async_read_on_peer(&mut cluster, p2.clone(), region.clone(), b"k1", true, true); - let resp1 = resp1_ch.recv_timeout(Duration::from_secs(5)).unwrap(); + let resp1 = block_on_timeout(resp1_ch, Duration::from_secs(5)).unwrap(); assert!( resp1.get_header().get_error().has_not_leader(), "{:?}", @@ -243,16 +248,17 @@ fn test_read_hibernated_region() { // Wait for the leader is woken up. thread::sleep(Duration::from_millis(500)); let resp2_ch = async_read_on_peer(&mut cluster, p2, region, b"k1", true, true); - let resp2 = resp2_ch.recv_timeout(Duration::from_secs(5)).unwrap(); + let resp2 = block_on_timeout(resp2_ch, Duration::from_secs(5)).unwrap(); assert!(!resp2.get_header().has_error(), "{:?}", resp2); } /// The read index response can advance the commit index. -/// But in previous implemtation, we forget to set term in read index response +/// But in previous implementation, we forget to set term in read index response /// which causes panic in raft-rs. This test is to reproduce the case. 
-#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_replica_read_on_stale_peer() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_lease_read(&mut cluster.cfg, Some(50), Some(30)); let pd_client = Arc::clone(&cluster.pd_client); @@ -278,14 +284,13 @@ fn test_replica_read_on_stale_peer() { cluster.must_put(b"k2", b"v2"); let resp1_ch = async_read_on_peer(&mut cluster, peer_on_store3, region, b"k2", true, true); // must be timeout - resp1_ch - .recv_timeout(Duration::from_micros(100)) - .unwrap_err(); + block_on_timeout(resp1_ch, Duration::from_micros(100)).unwrap_err(); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_read_index_out_of_order() { - let mut cluster = new_node_cluster(0, 2); + let mut cluster = new_cluster(0, 2); // Use long election timeout and short lease. configure_for_lease_read(&mut cluster.cfg, Some(1000), Some(10)); @@ -312,20 +317,21 @@ fn test_read_index_out_of_order() { // Can't get read resonse because heartbeat responses are blocked. let r1 = cluster.get_region(b"k1"); - let resp1 = async_read_on_peer(&mut cluster, new_peer(1, 1), r1.clone(), b"k1", true, true); - resp1.recv_timeout(Duration::from_secs(2)).unwrap_err(); + let mut resp1 = async_read_on_peer(&mut cluster, new_peer(1, 1), r1.clone(), b"k1", true, true); + block_on_timeout(resp1.as_mut(), Duration::from_secs(2)).unwrap_err(); pd_client.must_remove_peer(rid, new_peer(2, 2)); // After peer 2 is removed, we can get 2 read responses. 
let resp2 = async_read_on_peer(&mut cluster, new_peer(1, 1), r1, b"k1", true, true); - resp2.recv_timeout(Duration::from_secs(1)).unwrap(); - resp1.recv_timeout(Duration::from_secs(1)).unwrap(); + block_on_timeout(resp2, Duration::from_secs(1)).unwrap(); + block_on_timeout(resp1, Duration::from_secs(1)).unwrap(); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_read_index_retry_lock_checking() { - let mut cluster = new_node_cluster(0, 2); + let mut cluster = new_cluster(0, 2); // Use long election timeout and short lease. configure_for_lease_read(&mut cluster.cfg, Some(50), Some(20)); @@ -353,10 +359,10 @@ fn test_read_index_retry_lock_checking() { // Can't get response because read index responses are blocked. let r1 = cluster.get_region(b"k1"); - let resp1 = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1.clone(), b"k1", true); - let resp2 = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1, b"k2", true); - resp1.recv_timeout(Duration::from_secs(2)).unwrap_err(); - resp2.try_recv().unwrap_err(); + let mut resp1 = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1.clone(), b"k1", true); + let mut resp2 = async_read_index_on_peer(&mut cluster, new_peer(2, 2), r1, b"k2", true); + block_on_timeout(resp1.as_mut(), Duration::from_secs(2)).unwrap_err(); + block_on_timeout(resp2.as_mut(), Duration::from_millis(1)).unwrap_err(); // k1 has a memory lock let leader_cm = cluster.sim.rl().get_concurrency_manager(1); @@ -378,30 +384,31 @@ fn test_read_index_retry_lock_checking() { cluster.sim.wl().clear_recv_filters(2); // resp1 should contain key is locked error assert!( - resp1 - .recv_timeout(Duration::from_secs(2)) + block_on_timeout(resp1, Duration::from_secs(2)) .unwrap() .responses[0] .get_read_index() .has_locked() ); // resp2 should has a successful read index + let resp = block_on_timeout(resp2, Duration::from_secs(2)).unwrap(); assert!( - resp2 - 
.recv_timeout(Duration::from_secs(2)) - .unwrap() - .responses[0] - .get_read_index() - .get_read_index() - > 0 + !resp.get_header().has_error() + && resp + .get_responses() + .get(0) + .map_or(true, |r| !r.get_read_index().has_locked()), + "{:?}", + resp, ); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_split_isolation() { - let mut cluster = new_node_cluster(0, 2); + let mut cluster = new_cluster(0, 2); // Use long election timeout and short lease. - configure_for_hibernate(&mut cluster); + configure_for_hibernate(&mut cluster.cfg); configure_for_lease_read(&mut cluster.cfg, Some(50), Some(20)); cluster.cfg.raft_store.raft_log_gc_count_limit = Some(11); let pd_client = Arc::clone(&cluster.pd_client); @@ -446,7 +453,7 @@ fn test_split_isolation() { // cannot be created. for _ in 0..10 { let resp = async_read_on_peer(&mut cluster, peer.clone(), r2.clone(), b"k1", true, true); - let resp = resp.recv_timeout(Duration::from_secs(1)).unwrap(); + let resp = block_on_timeout(resp, Duration::from_secs(1)).unwrap(); if !resp.get_header().has_error() { return; } @@ -458,9 +465,10 @@ fn test_split_isolation() { /// Testing after applying snapshot, the `ReadDelegate` stored at `StoreMeta` /// will be replace with the new `ReadDelegate`, and the `ReadDelegate` stored /// at `LocalReader` should also be updated -#[test] -fn test_read_local_after_snapshpot_replace_peer() { - let mut cluster = new_node_cluster(0, 3); +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] +fn test_read_local_after_snapshot_replace_peer() { + let mut cluster = new_cluster(0, 3); configure_for_lease_read(&mut cluster.cfg, Some(50), None); cluster.cfg.raft_store.raft_log_gc_threshold = 12; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); @@ -484,7 +492,7 @@ fn test_read_local_after_snapshpot_replace_peer() { // wait applying snapshot finish sleep_ms(100); let resp = 
async_read_on_peer(&mut cluster, new_peer(3, 3), r, b"k1", true, true); - let resp = resp.recv_timeout(Duration::from_secs(1)).unwrap(); + let resp = block_on_timeout(resp, Duration::from_secs(1)).unwrap(); assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v1"); // trigger leader send snapshot to peer 3 @@ -513,10 +521,10 @@ fn test_read_local_after_snapshpot_replace_peer() { let r = cluster.get_region(b"k1"); let resp = async_read_on_peer(&mut cluster, new_peer(3, 1003), r, b"k3", true, true); - let resp = resp.recv_timeout(Duration::from_secs(1)).unwrap(); + let resp = block_on_timeout(resp, Duration::from_secs(1)).unwrap(); // should not have `mismatch peer id` error if resp.get_header().has_error() { - panic!("unexpect err: {:?}", resp.get_header().get_error()); + panic!("unexpected err: {:?}", resp.get_header().get_error()); } let exp_value = resp.get_responses()[0].get_get().get_value(); assert_eq!(exp_value, b"v3"); @@ -524,9 +532,10 @@ fn test_read_local_after_snapshpot_replace_peer() { /// The case checks if a malformed request should not corrupt the leader's read /// queue. -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_malformed_read_index() { - let mut cluster = new_node_cluster(0, 3); + let mut cluster = new_cluster(0, 3); configure_for_lease_read(&mut cluster.cfg, Some(50), None); cluster.cfg.raft_store.raft_log_gc_threshold = 12; cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); @@ -580,82 +589,6 @@ fn test_malformed_read_index() { // the read queue, the correct request should be responded. let resp = async_read_on_peer(&mut cluster, new_peer(1, 1), region, b"k1", true, false); cluster.clear_send_filters(); - let resp = resp.recv_timeout(Duration::from_secs(10)).unwrap(); - assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v1"); -} - -/// The case checks if a malformed request should not corrupt the leader's read -/// queue. 
-#[test] -fn test_malformed_read_index_v2() { - use test_raftstore_v2::*; - - let mut cluster = new_node_cluster(0, 3); - configure_for_lease_read(&mut cluster.cfg, Some(50), None); - cluster.cfg.raft_store.raft_log_gc_threshold = 12; - cluster.cfg.raft_store.raft_log_gc_count_limit = Some(12); - cluster.cfg.raft_store.hibernate_regions = true; - cluster.cfg.raft_store.check_leader_lease_interval = ReadableDuration::hours(10); - let pd_client = Arc::clone(&cluster.pd_client); - pd_client.disable_default_operator(); - - let region_id = cluster.run_conf_change(); - pd_client.must_add_peer(region_id, new_peer(2, 2)); - pd_client.must_add_peer(region_id, new_peer(3, 3)); - cluster.must_transfer_leader(1, new_peer(1, 1)); - cluster.must_put(b"k1", b"v1"); - for i in 1..=3 { - must_get_equal(&cluster.get_engine(i), b"k1", b"v1"); - } - - // Wait till lease expires. - std::thread::sleep( - cluster - .cfg - .raft_store - .raft_store_max_leader_lease() - .to_std() - .unwrap(), - ); - let region = cluster.get_region(b"k1"); - // Send a malformed request to leader - let mut raft_msg = raft::eraftpb::Message::default(); - raft_msg.set_msg_type(MessageType::MsgReadIndex); - let rctx = ReadIndexContext { - id: Uuid::new_v4(), - request: None, - locked: None, - }; - let mut e = raft::eraftpb::Entry::default(); - e.set_data(rctx.to_bytes().into()); - raft_msg.mut_entries().push(e); - raft_msg.from = 1; - raft_msg.to = 1; - let mut message = RaftMessage::default(); - message.set_region_id(region_id); - message.set_from_peer(new_peer(1, 1)); - message.set_to_peer(new_peer(1, 1)); - message.set_region_epoch(region.get_region_epoch().clone()); - message.set_message(raft_msg); - // So the read won't be handled soon. - cluster.add_send_filter(IsolationFilterFactory::new(1)); - cluster.send_raft_msg(message).unwrap(); - // Also send a correct request. If the malformed request doesn't corrupt - // the read queue, the correct request should be responded. 
- let resp = async_read_on_peer(&mut cluster, new_peer(1, 1), region, b"k1", true, false); - cluster.clear_send_filters(); - - let timeout = Duration::from_secs(10); - let timeout_f = GLOBAL_TIMER_HANDLE - .delay(std::time::Instant::now() + timeout) - .compat(); - let resp = futures::executor::block_on(async move { - futures::select! { - res = resp.fuse() => res.unwrap(), - e = timeout_f.fuse() => { - panic!("request timeout for {:?}: {:?}", timeout,e); - }, - } - }); + let resp = block_on_timeout(resp, Duration::from_secs(10)).unwrap(); assert_eq!(resp.get_responses()[0].get_get().get_value(), b"v1"); } diff --git a/tests/integrations/raftstore/test_replication_mode.rs b/tests/integrations/raftstore/test_replication_mode.rs index d20249bc53f..cb7de1fad35 100644 --- a/tests/integrations/raftstore/test_replication_mode.rs +++ b/tests/integrations/raftstore/test_replication_mode.rs @@ -1,16 +1,12 @@ // Copyright 2020 TiKV Project Authors. Licensed under Apache-2.0. -use std::{ - sync::{mpsc, Arc}, - thread, - time::Duration, -}; +use std::{sync::Arc, thread, time::Duration}; use kvproto::replication_modepb::*; use pd_client::PdClient; use raft::eraftpb::ConfChangeType; use test_raftstore::*; -use tikv_util::{config::*, HandyRwLock}; +use tikv_util::{config::*, mpsc::future, HandyRwLock}; fn prepare_cluster() -> Cluster { let mut cluster = new_server_cluster(0, 3); @@ -53,7 +49,7 @@ fn test_dr_auto_sync() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() @@ -75,7 +71,7 @@ fn test_dr_auto_sync() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() @@ -83,7 +79,7 @@ fn test_dr_auto_sync() { .unwrap(); assert_eq!( rx.recv_timeout(Duration::from_millis(100)), - Err(mpsc::RecvTimeoutError::Timeout) + Err(future::RecvTimeoutError::Timeout) ); 
must_get_none(&cluster.get_engine(1), b"k2"); let state = cluster.pd_client.region_replication_status(region.get_id()); @@ -105,7 +101,7 @@ fn test_sync_recover_after_apply_snapshot() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() @@ -113,7 +109,7 @@ fn test_sync_recover_after_apply_snapshot() { .unwrap(); assert_eq!( rx.recv_timeout(Duration::from_millis(100)), - Err(mpsc::RecvTimeoutError::Timeout) + Err(future::RecvTimeoutError::Timeout) ); must_get_none(&cluster.get_engine(1), b"k2"); let state = cluster.pd_client.region_replication_status(region.get_id()); @@ -252,7 +248,7 @@ fn test_switching_replication_mode() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() @@ -260,7 +256,7 @@ fn test_switching_replication_mode() { .unwrap(); assert_eq!( rx.recv_timeout(Duration::from_millis(100)), - Err(mpsc::RecvTimeoutError::Timeout) + Err(future::RecvTimeoutError::Timeout) ); must_get_none(&cluster.get_engine(1), b"k2"); let state = cluster.pd_client.region_replication_status(region.get_id()); @@ -288,7 +284,7 @@ fn test_switching_replication_mode() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() @@ -296,7 +292,7 @@ fn test_switching_replication_mode() { .unwrap(); assert_eq!( rx.recv_timeout(Duration::from_millis(100)), - Err(mpsc::RecvTimeoutError::Timeout) + Err(future::RecvTimeoutError::Timeout) ); must_get_none(&cluster.get_engine(1), b"k3"); let state = cluster.pd_client.region_replication_status(region.get_id()); @@ -329,7 +325,7 @@ fn test_replication_mode_allowlist() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() @@ -337,7 +333,7 @@ fn 
test_replication_mode_allowlist() { .unwrap(); assert_eq!( rx.recv_timeout(Duration::from_millis(100)), - Err(mpsc::RecvTimeoutError::Timeout) + Err(future::RecvTimeoutError::Timeout) ); // clear allowlist. @@ -417,7 +413,7 @@ fn test_migrate_replication_mode() { false, ); request.mut_header().set_peer(new_peer(1, 1)); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl() @@ -425,7 +421,7 @@ fn test_migrate_replication_mode() { .unwrap(); assert_eq!( rx.recv_timeout(Duration::from_millis(100)), - Err(mpsc::RecvTimeoutError::Timeout) + Err(future::RecvTimeoutError::Timeout) ); must_get_none(&cluster.get_engine(1), b"k2"); let state = cluster.pd_client.region_replication_status(region.get_id()); diff --git a/tests/integrations/raftstore/test_single.rs b/tests/integrations/raftstore/test_single.rs index 7fedc3c1cd4..d21a134a0c3 100644 --- a/tests/integrations/raftstore/test_single.rs +++ b/tests/integrations/raftstore/test_single.rs @@ -2,51 +2,15 @@ use std::time::Duration; -use engine_traits::{CfName, CF_DEFAULT, CF_WRITE}; +use engine_traits::{CF_DEFAULT, CF_WRITE}; use raftstore::store::RAFT_INIT_LOG_INDEX; use rand::prelude::*; -use test_raftstore::{new_put_cf_cmd, new_put_cmd, new_request, sleep_ms}; +use test_raftstore::{new_put_cmd, new_request, sleep_ms}; use test_raftstore_macro::test_case; use tikv_util::{config::*, time::Instant}; // TODO add epoch not match test cases. -fn test_delete_range( - cluster: &mut test_raftstore::Cluster, - cf: CfName, -) { - let data_set: Vec<_> = (1..500) - .map(|i| { - ( - format!("key{:08}", i).into_bytes(), - format!("value{}", i).into_bytes(), - ) - }) - .collect(); - for kvs in data_set.chunks(50) { - let requests = kvs.iter().map(|(k, v)| new_put_cf_cmd(cf, k, v)).collect(); - // key9 is always the last region. - cluster.batch_put(b"key9", requests).unwrap(); - } - - // delete_range request with notify_only set should not actually delete data. 
- cluster.must_notify_delete_range_cf(cf, b"", b""); - - let mut rng = rand::thread_rng(); - for _ in 0..50 { - let (k, v) = data_set.choose(&mut rng).unwrap(); - assert_eq!(cluster.get_cf(cf, k).unwrap(), *v); - } - - // Empty keys means the whole range. - cluster.must_delete_range_cf(cf, b"", b""); - - for _ in 0..50 { - let k = &data_set.choose(&mut rng).unwrap().0; - assert!(cluster.get_cf(cf, k).is_none()); - } -} - #[test_case(test_raftstore::new_node_cluster)] #[test_case(test_raftstore::new_server_cluster)] #[test_case(test_raftstore_v2::new_node_cluster)] @@ -127,9 +91,10 @@ fn test_delete() { } } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_use_delete_range() { - let mut cluster = test_raftstore::new_node_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.cfg.raft_store.use_delete_range = true; cluster.run(); test_delete_range(&mut cluster, CF_DEFAULT); @@ -137,9 +102,10 @@ fn test_node_use_delete_range() { test_delete_range(&mut cluster, CF_WRITE); } -#[test] +#[test_case(test_raftstore::new_node_cluster)] +#[test_case(test_raftstore_v2::new_node_cluster)] fn test_node_not_use_delete_range() { - let mut cluster = test_raftstore::new_node_cluster(0, 1); + let mut cluster = new_cluster(0, 1); cluster.cfg.raft_store.use_delete_range = false; cluster.run(); test_delete_range(&mut cluster, CF_DEFAULT); diff --git a/tests/integrations/raftstore/test_split_region.rs b/tests/integrations/raftstore/test_split_region.rs index 48b226ba40e..071856cbd29 100644 --- a/tests/integrations/raftstore/test_split_region.rs +++ b/tests/integrations/raftstore/test_split_region.rs @@ -9,7 +9,11 @@ use std::{ use engine_traits::{Peekable, CF_DEFAULT, CF_WRITE}; use keys::data_key; -use kvproto::{metapb, pdpb, raft_cmdpb::*, raft_serverpb::RaftMessage}; +use kvproto::{ + metapb, pdpb, + raft_cmdpb::*, + raft_serverpb::{ExtraMessageType, RaftMessage}, +}; use pd_client::PdClient; use 
raft::eraftpb::MessageType; use raftstore::{ @@ -1259,3 +1263,39 @@ fn test_catch_up_peers_after_split() { assert!(!pending_peers.contains_key(&p.id)) } } + +#[test] +fn test_split_region_keep_records() { + let mut cluster = test_raftstore_v2::new_node_cluster(0, 3); + let pd_client = Arc::clone(&cluster.pd_client); + pd_client.disable_default_operator(); + let r1 = cluster.run_conf_change(); + cluster.must_put(b"k1", b"v1"); + pd_client.must_add_peer(r1, new_peer(2, 2)); + must_get_equal(&cluster.get_engine(2), b"k1", b"v1"); + pd_client.must_remove_peer(r1, new_peer(2, 2)); + + let leader = cluster.leader_of_region(r1).unwrap(); + cluster.add_send_filter_on_node( + leader.get_store_id(), + Box::new(DropMessageFilter::new(Arc::new(|m: &RaftMessage| { + // Drop all gc peer requests and responses. + !(m.has_extra_msg() + && (m.get_extra_msg().get_type() == ExtraMessageType::MsgGcPeerRequest + || m.get_extra_msg().get_type() == ExtraMessageType::MsgGcPeerResponse)) + }))), + ); + + // Make sure split has applied. + let region = pd_client.get_region(b"").unwrap(); + cluster.must_split(®ion, b"k1"); + cluster.must_put(b"k2", b"v2"); + cluster.must_put(b"k0", b"v0"); + + let region_state = cluster.region_local_state(r1, leader.get_store_id()); + assert!( + !region_state.get_removed_records().is_empty(), + "{:?}", + region_state + ); +} diff --git a/tests/integrations/raftstore/test_tombstone.rs b/tests/integrations/raftstore/test_tombstone.rs index 3d7fc235cad..972a75212b4 100644 --- a/tests/integrations/raftstore/test_tombstone.rs +++ b/tests/integrations/raftstore/test_tombstone.rs @@ -80,7 +80,7 @@ fn test_tombstone(cluster: &mut Cluster) { raft_msg.set_region_id(r1); // Use an invalid from peer to ignore gc peer message. 
- raft_msg.set_from_peer(new_peer(0, 0)); + raft_msg.set_from_peer(new_peer(100, 100)); raft_msg.set_to_peer(new_peer(2, 2)); raft_msg.mut_region_epoch().set_conf_ver(0); raft_msg.mut_region_epoch().set_version(0); diff --git a/tests/integrations/server/debugger.rs b/tests/integrations/server/debugger.rs new file mode 100644 index 00000000000..e8a7bccb052 --- /dev/null +++ b/tests/integrations/server/debugger.rs @@ -0,0 +1,163 @@ +// Copyright 2023 TiKV Project Authors. Licensed under Apache-2.0. + +use collections::{HashMap, HashSet}; +use engine_rocks::{raw::Range, util::get_cf_handle}; +use engine_traits::{CachedTablet, MiscExt, CF_WRITE}; +use keys::{data_key, DATA_MAX_KEY}; +use kvproto::debugpb::Db; +use tikv::{ + config::ConfigController, + server::{debug::Debugger, debug2::DebuggerImplV2}, + storage::mvcc::{TimeStamp, Write, WriteType}, +}; +use txn_types::Key; + +fn gen_mvcc_put_kv( + k: &[u8], + v: &[u8], + start_ts: TimeStamp, + commit_ts: TimeStamp, +) -> (Vec, Vec) { + let k = Key::from_encoded(data_key(k)); + let k = k.append_ts(commit_ts); + let w = Write::new(WriteType::Put, start_ts, Some(v.to_vec())); + (k.as_encoded().clone(), w.as_ref().to_bytes()) +} + +fn gen_delete_k(k: &[u8], commit_ts: TimeStamp) -> Vec { + let k = Key::from_encoded(data_key(k)); + let k = k.append_ts(commit_ts); + k.as_encoded().clone() +} + +#[test] +fn test_compact() { + let (split_key, _) = gen_mvcc_put_kv(b"k10", b"", 1.into(), 2.into()); + let (split_key2, _) = gen_mvcc_put_kv(b"k20", b"", 1.into(), 2.into()); + let regions = vec![ + (1, b"".to_vec(), split_key.clone()), + (1000, split_key.clone(), split_key2.clone()), + (1002, split_key2.clone(), b"".to_vec()), + ]; + + let check_compact = |from: Vec, to: Vec, regions_compacted: HashSet| { + let count = 1; + let mut cluster = test_raftstore_v2::new_node_cluster(0, count); + cluster.cfg.raft_store.right_derive_when_split = false; + cluster.run(); + + let region = cluster.get_region(b""); + cluster.must_split(®ion, 
&split_key); + let region = cluster.get_region(&split_key); + cluster.must_split(®ion, &split_key2); + + for i in 0..30 { + let (k, v) = (format!("k{:02}", i), format!("value{}", i)); + let (k, v) = gen_mvcc_put_kv(k.as_bytes(), v.as_bytes(), 1.into(), 2.into()); + cluster.must_put_cf(CF_WRITE, &k, &v); + } + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|_, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + db.flush_cf(CF_WRITE, true).unwrap(); + } + true + }) + } + + for i in 0..30 { + let k = format!("k{:02}", i); + let k = gen_delete_k(k.as_bytes(), 2.into()); + cluster.must_delete_cf(CF_WRITE, &k); + } + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|_, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + db.flush_cf(CF_WRITE, true).unwrap(); + } + true + }) + } + + let mut tablet_size_before_compact = HashMap::default(); + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|region_id, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + let cf_handle = get_cf_handle(db.as_inner(), CF_WRITE).unwrap(); + let approximate_size = db + .as_inner() + .get_approximate_sizes_cf(cf_handle, &[Range::new(b"", DATA_MAX_KEY)])[0]; + tablet_size_before_compact.insert(region_id, approximate_size); + } + true + }) + } + + let debugger = DebuggerImplV2::new( + cluster.engines[0].0.clone(), + cluster.raft_engines.get(&1).unwrap().clone(), + ConfigController::default(), + ); + + debugger + .compact(Db::Kv, CF_WRITE, &from, &to, 1, Some("skip").into()) + .unwrap(); + + let mut tablet_size_after_compact = HashMap::default(); + for (registry, _) in &cluster.engines { + registry.for_each_opened_tablet(|region_id, db: &mut CachedTablet<_>| { + if let Some(db) = db.latest() { + let cf_handle = get_cf_handle(db.as_inner(), CF_WRITE).unwrap(); + let approximate_size = db + .as_inner() + .get_approximate_sizes_cf(cf_handle, &[Range::new(b"", DATA_MAX_KEY)])[0]; + 
tablet_size_after_compact.insert(region_id, approximate_size); + } + true + }) + } + for (id, &size) in &tablet_size_after_compact { + if regions_compacted.contains(id) { + assert!(size == 0); + continue; + } + + assert_eq!(tablet_size_before_compact[id], size); + } + }; + + // compact the middle region + let region = regions[1].clone(); + let mut regions_compacted = HashSet::default(); + regions_compacted.insert(region.0); + let from = keys::data_key(®ion.1); + let to = keys::data_end_key(®ion.2); + check_compact(from, to, regions_compacted); + + // compact first two regions + let region1 = regions[0].clone(); + let region2 = regions[1].clone(); + let mut regions_compacted = HashSet::default(); + regions_compacted.insert(region1.0); + regions_compacted.insert(region2.0); + let from = keys::data_key(®ion1.1); + let to = keys::data_end_key(®ion2.2); + check_compact(from, to, regions_compacted); + + // compact all regions by specifying specific keys + let region1 = regions[0].clone(); + let region2 = regions[2].clone(); + let mut regions_compacted = HashSet::default(); + let _ = regions + .iter() + .map(|(id, ..)| regions_compacted.insert(*id)) + .collect::>(); + let from = keys::data_key(®ion1.1); + let to = keys::data_end_key(®ion2.2); + check_compact(from, to, regions_compacted.clone()); + + // compact all regions + check_compact(b"".to_vec(), b"".to_vec(), regions_compacted.clone()); + check_compact(b"z".to_vec(), b"z".to_vec(), regions_compacted.clone()); + check_compact(b"z".to_vec(), b"{".to_vec(), regions_compacted); +} diff --git a/tests/integrations/server/kv_service.rs b/tests/integrations/server/kv_service.rs index 5e47ad4745b..1b4a03650d9 100644 --- a/tests/integrations/server/kv_service.rs +++ b/tests/integrations/server/kv_service.rs @@ -44,7 +44,7 @@ use tikv::{ gc_worker::sync_gc, service::{batch_commands_request, batch_commands_response}, }, - storage::txn::FLASHBACK_BATCH_SIZE, + storage::{config::EngineType, txn::FLASHBACK_BATCH_SIZE}, }; use 
tikv_util::{ config::ReadableSize, @@ -53,9 +53,10 @@ use tikv_util::{ }; use txn_types::{Key, Lock, LockType, TimeStamp}; -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_rawkv() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let v0 = b"v0".to_vec(); let v1 = b"v1".to_vec(); let (k, v) = (b"key".to_vec(), b"v2".to_vec()); @@ -123,9 +124,10 @@ fn test_rawkv() { assert!(delete_resp.error.is_empty()); } -#[test] +#[test_case(test_raftstore::must_new_and_configure_cluster)] +#[test_case(test_raftstore_v2::must_new_and_configure_cluster)] fn test_rawkv_ttl() { - let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| { + let (cluster, leader, ctx) = new_cluster(|cluster| { cluster.cfg.storage.enable_ttl = true; }); @@ -271,9 +273,10 @@ fn test_rawkv_ttl() { assert!(!prewrite_resp.get_errors().is_empty()); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_mvcc_basic() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; @@ -336,9 +339,10 @@ fn test_mvcc_basic() { } } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_mvcc_rollback_and_cleanup() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; @@ -551,10 +555,11 @@ fn test_mvcc_resolve_lock_gc_and_delete() { assert!(del_resp.error.is_empty()); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] #[cfg(feature = 
"failpoints")] fn test_mvcc_flashback_failed_after_first_batch() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let mut ts = 0; for i in 0..FLASHBACK_BATCH_SIZE * 2 { // Meet the constraints of the alphabetical order for test @@ -672,9 +677,10 @@ fn test_mvcc_flashback_failed_after_first_batch() { ); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_mvcc_flashback() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let mut ts = 0; // Need to write many batches. for i in 0..2000 { @@ -714,11 +720,13 @@ fn test_mvcc_flashback() { must_kv_read_equal(&client, ctx, b"key@1".to_vec(), b"value@1".to_vec(), ts); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_mvcc_flashback_block_rw() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); // Prepare the flashback. must_prepare_flashback(&client, ctx.clone(), 1, 2); + // Try to read version 3 (after flashback, FORBIDDEN). 
let (k, v) = (b"key".to_vec(), b"value".to_vec()); // Get @@ -727,7 +735,11 @@ fn test_mvcc_flashback_block_rw() { get_req.key = k.clone(); get_req.version = 3; let get_resp = client.kv_get(&get_req).unwrap(); - assert!(get_resp.get_region_error().has_flashback_in_progress()); + assert!( + get_resp.get_region_error().has_flashback_in_progress(), + "{:?}", + get_resp + ); assert!(!get_resp.has_error()); assert!(get_resp.value.is_empty()); // Scan @@ -772,9 +784,10 @@ fn test_mvcc_flashback_block_rw() { must_finish_flashback(&client, ctx, 1, 2, 3); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_mvcc_flashback_block_scheduling() { - let (mut cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (mut cluster, client, ctx) = new_cluster(); // Prepare the flashback. must_prepare_flashback(&client, ctx.clone(), 0, 1); // Try to transfer leader. @@ -783,15 +796,18 @@ fn test_mvcc_flashback_block_scheduling() { transfer_leader_resp .get_header() .get_error() - .has_flashback_in_progress() + .has_flashback_in_progress(), + "{:?}", + transfer_leader_resp ); // Finish the flashback. must_finish_flashback(&client, ctx, 0, 1, 2); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_mvcc_flashback_unprepared() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); @@ -809,25 +825,16 @@ fn test_mvcc_flashback_unprepared() { must_kv_read_equal(&client, ctx.clone(), k.clone(), v, 6); // Flashback with preparing. 
must_flashback_to_version(&client, ctx.clone(), 0, 6, 7); - let mut get_req = GetRequest::default(); - get_req.set_context(ctx.clone()); - get_req.key = k; - get_req.version = 7; - let get_resp = client.kv_get(&get_req).unwrap(); - assert!(!get_resp.has_region_error()); - assert!(!get_resp.has_error()); - assert_eq!(get_resp.value, b"".to_vec()); + must_kv_read_not_found(&client, ctx.clone(), k.clone(), 7); // Mock the flashback retry. must_finish_flashback(&client, ctx.clone(), 0, 6, 7); - let get_resp = client.kv_get(&get_req).unwrap(); - assert!(!get_resp.has_region_error()); - assert!(!get_resp.has_error()); - assert_eq!(get_resp.value, b"".to_vec()); + must_kv_read_not_found(&client, ctx, k, 7); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_mvcc_flashback_with_unlimited_range() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let mut ts = 0; write_and_read_key(&client, &ctx, &mut ts, k.clone(), v.clone()); @@ -853,21 +860,15 @@ fn test_mvcc_flashback_with_unlimited_range() { assert!(!resp.has_region_error()); assert!(resp.get_error().is_empty()); - let mut get_req = GetRequest::default(); - get_req.set_context(ctx); - get_req.key = k; - get_req.version = 7; - let get_resp = client.kv_get(&get_req).unwrap(); - assert!(!get_resp.has_region_error()); - assert!(!get_resp.has_error()); - assert_eq!(get_resp.value, b"".to_vec()); + must_kv_read_not_found(&client, ctx, k, 7); } // raft related RPC is tested as parts of test_snapshot.rs, so skip here. 
-#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_coprocessor() { - let (_cluster, client, _) = must_new_cluster_and_kv_client(); + let (_cluster, client, _) = new_cluster(); // SQL push down commands let mut req = Request::default(); req.set_tp(REQ_TYPE_DAG); @@ -940,9 +941,10 @@ fn test_split_region_impl(is_raw_kv: bool) { ); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_debug_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_debug_client)] fn test_debug_get() { - let (cluster, debug_client, store_id) = must_new_cluster_and_debug_client(); + let (cluster, debug_client, store_id) = new_cluster(); let (k, v) = (b"key", b"value"); // Put some data. @@ -968,9 +970,10 @@ fn test_debug_get() { } } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_debug_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_debug_client)] fn test_debug_raft_log() { - let (cluster, debug_client, store_id) = must_new_cluster_and_debug_client(); + let (cluster, debug_client, store_id) = new_cluster(); // Put some data. 
let engine = cluster.get_raft_engine(store_id); @@ -1006,6 +1009,8 @@ fn test_debug_raft_log() { } } +// Note: if modified in the future, should be sync with +// `test_debug_region_info_v2` #[test] fn test_debug_region_info() { let (cluster, debug_client, store_id) = must_new_cluster_and_debug_client(); @@ -1069,6 +1074,67 @@ fn test_debug_region_info() { } } +// Note: if modified in the future, should be sync with `test_debug_region_info` +#[test] +fn test_debug_region_info_v2() { + let (cluster, debug_client, store_id) = test_raftstore_v2::must_new_cluster_and_debug_client(); + + let raft_engine = cluster.get_raft_engine(store_id); + let region_id = 100; + let mut raft_state = raft_serverpb::RaftLocalState::default(); + raft_state.set_last_index(42); + let mut lb = raft_engine.log_batch(10); + lb.put_raft_state(region_id, &raft_state).unwrap(); + + let mut apply_state = raft_serverpb::RaftApplyState::default(); + apply_state.set_applied_index(42); + lb.put_apply_state(region_id, 42, &apply_state).unwrap(); + + let mut region_state = raft_serverpb::RegionLocalState::default(); + region_state.set_state(raft_serverpb::PeerState::Tombstone); + lb.put_region_state(region_id, 42, ®ion_state).unwrap(); + + raft_engine.consume(&mut lb, false).unwrap(); + assert_eq!( + raft_engine.get_raft_state(region_id).unwrap().unwrap(), + raft_state + ); + + assert_eq!( + raft_engine + .get_apply_state(region_id, u64::MAX) + .unwrap() + .unwrap(), + apply_state + ); + + assert_eq!( + raft_engine + .get_region_state(region_id, u64::MAX) + .unwrap() + .unwrap(), + region_state + ); + + // Debug region_info + let mut req = debugpb::RegionInfoRequest::default(); + req.set_region_id(region_id); + let mut resp = debug_client.region_info(&req).unwrap(); + assert_eq!(resp.take_raft_local_state(), raft_state); + assert_eq!(resp.take_raft_apply_state(), apply_state); + assert_eq!(resp.take_region_local_state(), region_state); + + req.set_region_id(region_id + 1); + match 
debug_client.region_info(&req).unwrap_err() { + Error::RpcFailure(status) => { + assert_eq!(status.code(), RpcStatusCode::NOT_FOUND); + } + _ => panic!("expect NotFound"), + } +} + +// Note: if modified in the future, should be sync with +// `test_debug_region_size_v2` #[test] fn test_debug_region_size() { let (cluster, debug_client, store_id) = must_new_cluster_and_debug_client(); @@ -1117,6 +1183,56 @@ fn test_debug_region_size() { } } +// Note: if modified in the future, should be sync with `test_debug_region_size` +#[test] +fn test_debug_region_size_v2() { + let (cluster, debug_client, store_id) = test_raftstore_v2::must_new_cluster_and_debug_client(); + let raft_engine = cluster.get_raft_engine(store_id); + let engine = cluster.get_engine(store_id); + + let mut lb = raft_engine.log_batch(10); + // Put some data. + let region_id = 1; + let mut region = metapb::Region::default(); + region.set_id(region_id); + region.set_start_key(b"a".to_vec()); + region.set_end_key(b"z".to_vec()); + let mut state = RegionLocalState::default(); + state.set_region(region); + state.set_tablet_index(5); + lb.put_region_state(region_id, 5, &state).unwrap(); + raft_engine.consume(&mut lb, false).unwrap(); + + let cfs = vec![CF_DEFAULT, CF_LOCK, CF_WRITE]; + // At lease 8 bytes for the WRITE cf. 
+ let (k, v) = (keys::data_key(b"kkkk_kkkk"), b"v"); + for cf in &cfs { + engine.put_cf(cf, k.as_slice(), v).unwrap(); + } + + let mut req = debugpb::RegionSizeRequest::default(); + req.set_region_id(region_id); + req.set_cfs(cfs.iter().map(|s| s.to_string()).collect()); + let entries: Vec<_> = debug_client + .region_size(&req) + .unwrap() + .take_entries() + .into(); + assert_eq!(entries.len(), 3); + for e in entries { + cfs.iter().find(|&&c| c == e.cf).unwrap(); + assert!(e.size > 0); + } + + req.set_region_id(region_id + 1); + match debug_client.region_size(&req).unwrap_err() { + Error::RpcFailure(status) => { + assert_eq!(status.code(), RpcStatusCode::NOT_FOUND); + } + _ => panic!("expect NotFound"), + } +} + #[test] #[cfg(feature = "failpoints")] fn test_debug_fail_point() { @@ -1154,9 +1270,10 @@ fn test_debug_fail_point() { ); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_debug_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_debug_client)] fn test_debug_scan_mvcc() { - let (cluster, debug_client, store_id) = must_new_cluster_and_debug_client(); + let (cluster, debug_client, store_id) = new_cluster(); let engine = cluster.get_engine(store_id); // Put some data. 
@@ -1238,9 +1355,10 @@ fn test_double_run_node() { cluster.shutdown(); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_pessimistic_lock() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); // Prewrite @@ -1294,9 +1412,10 @@ fn test_pessimistic_lock() { } } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_pessimistic_lock_resumable() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); // Resumable pessimistic lock request with multi-key is not supported yet. let resp = kv_pessimistic_lock_resumable( @@ -1516,9 +1635,10 @@ fn test_pessimistic_lock_resumable() { } } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_check_txn_status_with_max_ts() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k, v) = (b"key".to_vec(), b"value".to_vec()); let lock_ts = 10; @@ -1538,29 +1658,10 @@ fn test_check_txn_status_with_max_ts() { must_kv_commit(&client, ctx, vec![k], lock_ts, lock_ts + 1, lock_ts + 1); } -fn build_client(cluster: &Cluster) -> (TikvClient, Context) { - let region = cluster.get_region(b""); - let leader = region.get_peers()[0].clone(); - let addr = cluster.sim.rl().get_addr(leader.get_store_id()); - - let env = Arc::new(Environment::new(1)); - let channel = ChannelBuilder::new(env).connect(&addr); - let client = TikvClient::new(channel); - - let mut ctx = Context::default(); - ctx.set_region_id(leader.get_id()); - ctx.set_region_epoch(region.get_region_epoch().clone()); - ctx.set_peer(leader); - - (client, ctx) -} - -#[test] 
+#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_batch_commands() { - let mut cluster = new_server_cluster(0, 1); - cluster.run(); - - let (client, _) = build_client(&cluster); + let (_cluster, client, _ctx) = new_cluster(); let (mut sender, receiver) = client.batch_commands().unwrap(); for _ in 0..1000 { let mut batch_req = BatchCommandsRequest::default(); @@ -1591,12 +1692,10 @@ fn test_batch_commands() { rx.recv_timeout(Duration::from_secs(1)).unwrap(); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_empty_commands() { - let mut cluster = new_server_cluster(0, 1); - cluster.run(); - - let (client, _) = build_client(&cluster); + let (_cluster, client, _ctx) = new_cluster(); let (mut sender, receiver) = client.batch_commands().unwrap(); for _ in 0..1000 { let mut batch_req = BatchCommandsRequest::default(); @@ -1631,12 +1730,10 @@ fn test_empty_commands() { rx.recv_timeout(Duration::from_secs(5)).unwrap(); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_async_commit_check_txn_status() { - let mut cluster = new_server_cluster(0, 1); - cluster.run(); - - let (client, ctx) = build_client(&cluster); + let (cluster, client, ctx) = new_cluster(); let start_ts = block_on(cluster.pd_client.get_tso()).unwrap(); let mut req = PrewriteRequest::default(); @@ -1661,16 +1758,14 @@ fn test_async_commit_check_txn_status() { assert_ne!(resp.get_action(), Action::MinCommitTsPushed); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_prewrite_check_max_commit_ts() { - let mut cluster = new_server_cluster(0, 1); - cluster.run(); + let (cluster, client, ctx) = new_cluster(); let cm = 
cluster.sim.read().unwrap().get_concurrency_manager(1); cm.update_max_ts(100.into()); - let (client, ctx) = build_client(&cluster); - let mut req = PrewriteRequest::default(); req.set_context(ctx.clone()); req.set_primary_lock(b"k1".to_vec()); @@ -1732,9 +1827,10 @@ fn test_prewrite_check_max_commit_ts() { cm.read_range_check(None, None, |_, _| Err(())).unwrap(); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_txn_heart_beat() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let mut req = TxnHeartBeatRequest::default(); let k = b"k".to_vec(); let start_ts = 10; @@ -1754,9 +1850,11 @@ fn test_txn_heart_beat() { ); } -fn test_with_memory_lock_cluster(f: impl FnOnce(TikvClient, Context, /* raw_key */ Vec, Lock)) { - let (cluster, client, ctx) = must_new_cluster_and_kv_client(); - let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); +fn test_with_memory_lock_cluster( + cm: ConcurrencyManager, + client: TikvClient, + f: impl FnOnce(TikvClient, /* raw_key */ Vec, Lock), +) { let raw_key = b"key".to_vec(); let key = Key::from_raw(&raw_key); let guard = block_on(cm.lock_key(&key)); @@ -1774,12 +1872,15 @@ fn test_with_memory_lock_cluster(f: impl FnOnce(TikvClient, Context, /* raw_key guard.with_lock(|l| { *l = Some(lock.clone()); }); - f(client, ctx, raw_key, lock); + f(client, raw_key, lock); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_batch_get_memory_lock() { - test_with_memory_lock_cluster(|client, ctx, raw_key, lock| { + let (cluster, client, ctx) = new_cluster(); + let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); + test_with_memory_lock_cluster(cm, client, |client, raw_key, lock| { let mut req = BatchGetRequest::default(); req.set_context(ctx); 
req.set_keys(vec![b"unlocked".to_vec(), raw_key.clone()].into()); @@ -1791,9 +1892,12 @@ fn test_batch_get_memory_lock() { }); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_kv_scan_memory_lock() { - test_with_memory_lock_cluster(|client, ctx, raw_key, lock| { + let (cluster, client, ctx) = new_cluster(); + let cm = cluster.sim.read().unwrap().get_concurrency_manager(1); + test_with_memory_lock_cluster(cm, client, |client, raw_key, lock| { let mut req = ScanRequest::default(); req.set_context(ctx); req.set_start_key(b"a".to_vec()); @@ -1847,51 +1951,12 @@ macro_rules! test_func_init { }}; } -fn setup_cluster() -> (Cluster, TikvClient, CallOption, Context) { - let mut cluster = new_server_cluster(0, 3); - cluster.run(); - - let region_id = 1; - let leader = cluster.leader_of_region(region_id).unwrap(); - let leader_addr = cluster.sim.rl().get_addr(leader.get_store_id()); - let region = cluster.get_region(b"k1"); - let follower = region - .get_peers() - .iter() - .find(|p| **p != leader) - .unwrap() - .clone(); - let follower_addr = cluster.sim.rl().get_addr(follower.get_store_id()); - let epoch = cluster.get_region_epoch(region_id); - let mut ctx = Context::default(); - ctx.set_region_id(region_id); - ctx.set_peer(leader); - ctx.set_region_epoch(epoch); - - let env = Arc::new(Environment::new(1)); - let channel = ChannelBuilder::new(env).connect(&follower_addr); - let client = TikvClient::new(channel); - - // Verify not setting forwarding header will result in store not match. 
- let mut put_req = RawPutRequest::default(); - put_req.set_context(ctx.clone()); - let put_resp = client.raw_put(&put_req).unwrap(); - assert!( - put_resp.get_region_error().has_store_not_match(), - "{:?}", - put_resp - ); - assert!(put_resp.error.is_empty(), "{:?}", put_resp); - - let call_opt = server::build_forward_option(&leader_addr).timeout(Duration::from_secs(3)); - (cluster, client, call_opt, ctx) -} - /// Check all supported requests can go through proxy correctly. -#[test] +#[test_case(test_raftstore::setup_cluster)] +#[test_case(test_raftstore_v2::setup_cluster)] fn test_tikv_forwarding() { - let (_cluster, client, call_opt, ctx) = setup_cluster(); - + let (_cluster, client, leader_addr, ctx) = new_cluster(); + let call_opt = server::build_forward_option(&leader_addr).timeout(Duration::from_secs(3)); // Verify not setting forwarding header will result in store not match. let mut put_req = RawPutRequest::default(); put_req.set_context(ctx.clone()); @@ -2049,9 +2114,11 @@ fn test_tikv_forwarding() { /// Test if forwarding works correctly if the target node is shutdown and /// restarted. 
-#[test] +#[test_case(test_raftstore::setup_cluster)] +#[test_case(test_raftstore_v2::setup_cluster)] fn test_forwarding_reconnect() { - let (mut cluster, client, call_opt, ctx) = setup_cluster(); + let (mut cluster, client, leader_addr, ctx) = new_cluster(); + let call_opt = server::build_forward_option(&leader_addr).timeout(Duration::from_secs(3)); let leader = cluster.leader_of_region(1).unwrap(); cluster.stop_node(leader.get_store_id()); @@ -2074,11 +2141,10 @@ fn test_forwarding_reconnect() { assert!(!resp.get_region_error().has_store_not_match(), "{:?}", resp); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_health_check() { - let mut cluster = new_server_cluster(0, 1); - cluster.run(); - + let (mut cluster, _client, _ctx) = new_cluster(); let addr = cluster.sim.rl().get_addr(1); let env = Arc::new(Environment::new(1)); @@ -2095,9 +2161,10 @@ fn test_health_check() { client.check(&req).unwrap_err(); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_get_lock_wait_info_api() { - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let client2 = client.clone(); let mut ctx1 = ctx.clone(); @@ -2139,7 +2206,8 @@ fn test_get_lock_wait_info_api() { // * rfc: https://github.com/tikv/rfcs/blob/master/text/0069-api-v2.md. // * proto: https://github.com/pingcap/kvproto/blob/master/proto/kvrpcpb.proto, // enum APIVersion. 
-#[test] +#[test_case(test_raftstore::must_new_and_configure_cluster)] +#[test_case(test_raftstore_v2::must_new_and_configure_cluster)] fn test_txn_api_version() { const TIDB_KEY_CASE: &[u8] = b"t_a"; const TXN_KEY_CASE: &[u8] = b"x\0a"; @@ -2198,9 +2266,8 @@ fn test_txn_api_version() { for (i, (storage_api_version, req_api_version, key, errcode)) in test_data.into_iter().enumerate() { - let (cluster, leader, mut ctx) = must_new_and_configure_cluster(|cluster| { - cluster.cfg.storage.set_api_version(storage_api_version) - }); + let (cluster, leader, mut ctx) = + new_cluster(|cluster| cluster.cfg.storage.set_api_version(storage_api_version)); let env = Arc::new(Environment::new(1)); let channel = ChannelBuilder::new(env).connect(&cluster.sim.rl().get_addr(leader.get_store_id())); @@ -2339,9 +2406,10 @@ fn test_txn_api_version() { } } -#[test] +#[test_case(test_raftstore::must_new_and_configure_cluster)] +#[test_case(test_raftstore_v2::must_new_and_configure_cluster)] fn test_storage_with_quota_limiter_enable() { - let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| { + let (cluster, leader, ctx) = new_cluster(|cluster| { // write_bandwidth is limited to 1, which means that every write request will // trigger the limit. let quota_config = QuotaConfig { @@ -2375,9 +2443,10 @@ fn test_storage_with_quota_limiter_enable() { assert!(begin.elapsed() > Duration::from_millis(500)); } -#[test] +#[test_case(test_raftstore::must_new_and_configure_cluster)] +#[test_case(test_raftstore_v2::must_new_and_configure_cluster)] fn test_storage_with_quota_limiter_disable() { - let (cluster, leader, ctx) = must_new_and_configure_cluster(|cluster| { + let (cluster, leader, ctx) = new_cluster(|cluster| { // all limit set to 0, which means quota limiter not work. 
let quota_config = QuotaConfig::default(); cluster.cfg.quota = quota_config; @@ -2405,9 +2474,11 @@ fn test_storage_with_quota_limiter_disable() { assert!(begin.elapsed() < Duration::from_millis(500)); } -#[test] +#[test_case(test_raftstore::must_new_and_configure_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_and_configure_cluster_and_kv_client)] fn test_commands_write_detail() { - let (_cluster, client, ctx) = must_new_and_configure_cluster_and_kv_client(|cluster| { + test_util::init_log_for_test(); + let (cluster, client, ctx) = new_cluster(|cluster| { cluster.cfg.pessimistic_txn.pipelined = false; cluster.cfg.pessimistic_txn.in_memory = false; }); @@ -2428,7 +2499,11 @@ fn test_commands_write_detail() { // Mutex has been removed from write path. // Ref https://github.com/facebook/rocksdb/pull/7516 // assert!(wd.get_apply_mutex_lock_nanos() > 0); - assert!(wd.get_apply_write_wal_nanos() > 0); + + // MultiRocksDB does not have wal + if cluster.cfg.storage.engine == EngineType::RaftKv { + assert!(wd.get_apply_write_wal_nanos() > 0); + } assert!(wd.get_apply_write_memtable_nanos() > 0); assert!(wd.get_process_nanos() > 0); }; @@ -2480,12 +2555,10 @@ fn test_commands_write_detail() { check_write_detail(commit_resp.get_exec_details_v2().get_write_detail()); } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_rpc_wall_time() { - let mut cluster = new_server_cluster(0, 1); - cluster.run(); - - let (_cluster, client, ctx) = must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let k = b"key".to_vec(); let mut get_req = GetRequest::default(); get_req.set_context(ctx); @@ -2546,9 +2619,10 @@ fn test_rpc_wall_time() { } } -#[test] +#[test_case(test_raftstore::must_new_cluster_and_kv_client)] +#[test_case(test_raftstore_v2::must_new_cluster_and_kv_client)] fn test_pessimistic_lock_execution_tracking() { - let (_cluster, client, ctx) = 
must_new_cluster_and_kv_client(); + let (_cluster, client, ctx) = new_cluster(); let (k, v) = (b"k1".to_vec(), b"k2".to_vec()); // Add a prewrite lock. diff --git a/tests/integrations/server/mod.rs b/tests/integrations/server/mod.rs index dc89eb63fc8..fb813106cce 100644 --- a/tests/integrations/server/mod.rs +++ b/tests/integrations/server/mod.rs @@ -1,5 +1,6 @@ // Copyright 2018 TiKV Project Authors. Licensed under Apache-2.0. +mod debugger; mod gc_worker; mod kv_service; mod lock_manager; diff --git a/tests/integrations/server/raft_client.rs b/tests/integrations/server/raft_client.rs index fa7a86f12c4..aad9ab7ceb1 100644 --- a/tests/integrations/server/raft_client.rs +++ b/tests/integrations/server/raft_client.rs @@ -19,7 +19,7 @@ use kvproto::{ tikvpb::BatchRaftMessage, }; use raft::eraftpb::Entry; -use raftstore::{errors::DiscardReason, store::StoreMsg}; +use raftstore::errors::DiscardReason; use tikv::server::{ self, load_statistics::ThreadLoadPool, raftkv::RaftRouterWrap, resolve, resolve::Callback, Config, ConnectionBuilder, RaftClient, StoreAddrResolver, TestRaftStoreRouter, @@ -28,7 +28,6 @@ use tikv_kv::{FakeExtension, RaftExtension}; use tikv_util::{ config::{ReadableDuration, VersionTrack}, worker::{Builder as WorkerBuilder, LazyWorker}, - Either, }; use super::*; @@ -73,7 +72,7 @@ where worker.scheduler(), loads, ); - RaftClient::new(builder) + RaftClient::new(0, builder) } fn get_raft_client_by_port(port: u16) -> RaftClient { @@ -206,59 +205,6 @@ fn test_raft_client_reconnect() { drop(mock_server); } -#[test] -// Test raft_client reports store unreachable only once until being connected -// again -fn test_raft_client_report_unreachable() { - let msg_count = Arc::new(AtomicUsize::new(0)); - let batch_msg_count = Arc::new(AtomicUsize::new(0)); - let service = MockKvForRaft::new(Arc::clone(&msg_count), Arc::clone(&batch_msg_count), true); - let (mut mock_server, port) = create_mock_server(service, 60100, 60200).unwrap(); - - let (tx, rx) = 
mpsc::channel(); - let (significant_msg_sender, _significant_msg_receiver) = mpsc::channel(); - let router = TestRaftStoreRouter::new(tx, significant_msg_sender); - let wrap = RaftRouterWrap::new(router); - let mut raft_client = get_raft_client(wrap, StaticResolver::new(port)); - - // server is disconnected - mock_server.shutdown(); - drop(mock_server); - - raft_client.send(RaftMessage::default()).unwrap(); - let msg = rx.recv_timeout(Duration::from_millis(200)).unwrap(); - if let Either::Right(StoreMsg::StoreUnreachable { store_id }) = msg { - assert_eq!(store_id, 0); - } else { - panic!("expect StoreUnreachable"); - } - // no more unreachable message is sent until it's connected again. - rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); - - // restart the mock server. - let service = MockKvForRaft::new(Arc::clone(&msg_count), batch_msg_count, true); - let mut mock_server = create_mock_server_on(service, port); - - // make sure the connection is connected, otherwise the following sent messages - // may be dropped - std::thread::sleep(Duration::from_millis(200)); - (0..50).for_each(|_| raft_client.send(RaftMessage::default()).unwrap()); - raft_client.flush(); - check_msg_count(500, &msg_count, 50); - - // server is disconnected - mock_server.take().unwrap().shutdown(); - - let msg = rx.recv_timeout(Duration::from_millis(200)).unwrap(); - if let Either::Right(StoreMsg::StoreUnreachable { store_id }) = msg { - assert_eq!(store_id, 0); - } else { - panic!("expect StoreUnreachable"); - } - // no more unreachable message is sent until it's connected again. 
- rx.recv_timeout(Duration::from_millis(200)).unwrap_err(); -} - #[test] fn test_batch_size_limit() { let msg_count = Arc::new(AtomicUsize::new(0)); diff --git a/tests/integrations/storage/test_raftkv.rs b/tests/integrations/storage/test_raftkv.rs index 3dcdab0cf6b..72ea50fa184 100644 --- a/tests/integrations/storage/test_raftkv.rs +++ b/tests/integrations/storage/test_raftkv.rs @@ -11,7 +11,7 @@ use raft::eraftpb::MessageType; use test_raftstore::*; use tikv::storage::{kv::*, CfStatistics}; use tikv_util::{codec::bytes, HandyRwLock}; -use txn_types::{Key, Lock, LockType}; +use txn_types::{Key, Lock, LockType, TimeStamp}; #[test] fn test_raftkv() { @@ -255,20 +255,27 @@ fn test_read_on_replica_check_memory_locks() { follower_ctx.set_region_epoch(region.get_region_epoch().clone()); follower_ctx.set_peer(follower_peer.as_ref().unwrap().clone()); follower_ctx.set_replica_read(true); - let mut range = KeyRange::default(); - range.set_start_key(encoded_key.as_encoded().to_vec()); - let follower_snap_ctx = SnapContext { - pb_ctx: &follower_ctx, - start_ts: Some(100.into()), - key_ranges: vec![range], - ..Default::default() - }; - let mut follower_storage = cluster.sim.rl().storages[&follower_id].clone(); - match follower_storage.snapshot(follower_snap_ctx) { - Err(Error(box ErrorInner::KeyIsLocked(lock_info))) => { - assert_eq!(lock_info, lock.into_lock_info(raw_key.to_vec())) + for use_max_ts in [false, true] { + let mut range = KeyRange::default(); + range.set_start_key(encoded_key.as_encoded().to_vec()); + let ts = if use_max_ts { + Some(TimeStamp::max()) + } else { + Some(100.into()) + }; + let follower_snap_ctx = SnapContext { + pb_ctx: &follower_ctx, + start_ts: ts, + key_ranges: vec![range], + ..Default::default() + }; + let mut follower_storage = cluster.sim.rl().storages[&follower_id].clone(); + match follower_storage.snapshot(follower_snap_ctx) { + Err(Error(box ErrorInner::KeyIsLocked(lock_info))) => { + assert_eq!(lock_info, 
lock.clone().into_lock_info(raw_key.to_vec())) + } + other => panic!("unexpected result: {:?}", other), } - other => panic!("unexpected result: {:?}", other), } } @@ -315,7 +322,7 @@ fn test_invalid_read_index_when_no_leader() { true, ); request.mut_header().set_peer(follower.clone()); - let (cb, rx) = make_cb(&request); + let (cb, mut rx) = make_cb(&request); cluster .sim .rl()