diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c0a65efb..2215ce38 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,7 +70,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: clippy - args: --workspace --tests --all-features -- -D warnings + args: --workspace --exclude raft-engine --tests --all-features -- -D warnings - name: Check format uses: actions-rs/cargo@v1 with: diff --git a/Cargo.lock b/Cargo.lock index ca962944..6a456ea8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -28,6 +28,20 @@ dependencies = [ "version_check", ] +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "const-random", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.3" @@ -37,6 +51,18 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "anyhow" version = "1.0.81" @@ -247,6 +273,12 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" version = "1.0.90" @@ -272,6 +304,33 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "clang-sys" version = "1.7.0" @@ -306,7 +365,7 @@ version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae6371b8bdc8b7d3959e9cf7b22d4435ef3e79e138688421ec654acf8c81b008" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro-error", "proc-macro2", "quote", @@ -355,10 +414,30 @@ dependencies = [ "rust-ini", "serde", "serde_json", - "toml", + "toml 0.5.11", "yaml-rust", ] +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + 
"once_cell", + "tiny-keccak", +] + [[package]] name = "const-str" version = "0.4.3" @@ -399,6 +478,42 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +dependencies = [ + "anes", + "atty", + "cast", + "ciborium", + "clap", + "criterion-plot", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + [[package]] name = "crossbeam" version = "0.8.4" @@ -455,6 +570,12 @@ version = "0.8.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "crypto-common" version = "0.1.6" @@ -475,6 +596,16 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "ctor" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" +dependencies = [ + "quote", + "syn 2.0.58", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -566,6 +697,19 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c34f04666d835ff5d62e058c3995147c06f42fe86ff053337632bca83e42702d" +[[package]] +name = "env_logger" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -830,13 +974,23 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + [[package]] name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" dependencies = [ - "ahash", + "ahash 0.7.8", ] [[package]] @@ -844,6 +998,10 @@ name = "hashbrown" version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash 0.8.11", + "allocator-api2", +] [[package]] name = "heck" @@ -851,6 +1009,12 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -915,6 +1079,12 @@ version = "1.0.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + [[package]] name = "hyper" version = "0.14.28" @@ -1000,6 +1170,15 @@ dependencies = [ "hashbrown 0.14.3", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "io-lifetimes" version = "1.0.11" @@ -1100,6 +1279,12 @@ dependencies = [ "windows-targets 0.52.4", ] +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "libredox" version = "0.1.3" @@ -1201,11 +1386,20 @@ version = "2.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" +[[package]] +name = "memmap2" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" -version = "0.6.5" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" dependencies = [ "autocfg", ] @@ -1277,11 +1471,10 @@ dependencies = [ [[package]] name = "nix" -version = "0.25.1" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f346ff70e7dbfd675fe90590b92d59ef2de15a8779ae305ebcbfd3f0caf59be4" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" dependencies = [ - "autocfg", "bitflags 1.3.2", "cfg-if", "libc", @@ -1337,13 +1530,13 @@ checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" [[package]] name = "num-derive" -version = "0.3.3" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "876a53fff98e03a936a674b29568b0e605f06b29372c2489ff4de23f1949743d" +checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.58", ] [[package]] @@ -1353,6 +1546,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -1380,6 +1574,12 @@ version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + [[package]] name = "openssl" version = "0.10.64" @@ -1591,6 +1791,34 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" +[[package]] +name = "plotters" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" + +[[package]] +name = "plotters-svg" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" +dependencies = [ + "plotters-backend", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -1717,7 +1945,7 @@ dependencies = [ "bytes", "cfg-if", "cmake", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -1737,7 +1965,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", - "heck", + "heck 0.4.1", "itertools", "lazy_static", "log", @@ -1799,9 +2027,9 @@ dependencies = [ [[package]] name = "protobuf" -version = "2.28.0" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" +checksum = "8aefcec9f142b524d98fc81d07827743be89dd6586a1ba6ab21fa66a500b3fa5" [[package]] name = "protobuf-build" @@ -1863,35 +2091,43 @@ dependencies = [ [[package]] name = "raft-engine" -version = "0.2.0" -source = "git+https://github.com/w41ter/raft-engine.git?branch=prost-0.11#2c7d2d900f09243bdaa8f82bd42445c9b60509a0" +version = "0.4.2" dependencies = [ "byteorder", "crc32fast", + "criterion", "crossbeam", + "ctor 0.2.8", + "env_logger", "fail", "fs2", - "hashbrown 0.12.3", + "hashbrown 0.14.3", "hex", "if_chain", "lazy_static", "libc", "log", "lz4-sys", - "nix 0.25.1", + "memmap2", + "nix 0.26.4", "num-derive", "num-traits", "parking_lot", "prometheus", "prometheus-static-metric", "prost 0.11.9", - "protobuf", + "raft", + "rand 0.8.5", + "rand_distr", "rayon", + "rhai", "scopeguard", "serde", "serde_repr", "strum", + "tempfile", "thiserror", + "toml 0.8.14", ] [[package]] @@ -1963,6 +2199,16 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + [[package]] name = "rayon" version = "1.10.0" @@ -2105,6 +2351,34 @@ dependencies = [ "winreg", ] +[[package]] +name = "rhai" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a7d88770120601ba1e548bb6bc2a05019e54ff01b51479e38e64ec3b59d4759" +dependencies = [ + "ahash 0.8.11", + "bitflags 2.5.0", + "instant", + "num-traits", + "once_cell", + "rhai_codegen", + "smallvec", + "smartstring", + "thin-vec", +] + +[[package]] +name = "rhai_codegen" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59aecf17969c04b9c0c5d21f6bc9da9fec9dd4980e64d1871443a476589d8c86" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.58", +] + [[package]] name = "rocksdb" version = "0.22.0" @@ -2230,6 +2504,15 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +[[package]] +name = 
"same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.23" @@ -2292,7 +2575,7 @@ dependencies = [ "serde_json", "tabled", "tokio", - "toml", + "toml 0.5.11", "tracing", "tracing-subscriber", ] @@ -2315,7 +2598,7 @@ version = "0.5.0" dependencies = [ "async-stream", "crc32fast", - "ctor", + "ctor 0.1.26", "derivative", "futures", "lazy_static", @@ -2445,7 +2728,7 @@ dependencies = [ "async-stream", "const-str", "crc32fast", - "ctor", + "ctor 0.1.26", "dashmap", "derivative", "futures", @@ -2536,6 +2819,15 @@ dependencies = [ "syn 2.0.58", ] +[[package]] +name = "serde_spanned" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -2666,6 +2958,17 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +[[package]] +name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check", +] + [[package]] name = "socket2" version = "0.4.10" @@ -2686,6 +2989,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.10.0" @@ -2694,24 +3003,24 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "strum" -version = "0.24.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" dependencies = [ "strum_macros", ] [[package]] name = "strum_macros" -version = "0.24.3" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e385be0d24f186b4ce2f9982191e7101bb737312ad61c1f2f984f34bcf85d59" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", - "syn 1.0.109", + "syn 2.0.58", ] [[package]] @@ -2795,7 +3104,7 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c138f99377e5d653a371cdad263615634cfc8467685dfe8e73e2b8e98f44b17" dependencies = [ - "heck", + "heck 0.4.1", "proc-macro-error", "proc-macro2", "quote", @@ -2856,6 +3165,12 @@ version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9" +[[package]] +name = "thin-vec" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a38c90d48152c236a3ab59271da4f4ae63d678c5d7ad6b7714d7cb9760be5e4b" + [[package]] name = "thiserror" version = "1.0.58" @@ -2917,6 +3232,25 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -3017,6 +3351,40 @@ dependencies = [ "serde", ] +[[package]] +name = "toml" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f49eb2ab21d2f26bd6db7bf383edc527a7ebaee412d17af4d40fdccd442f335" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f21c7aaf97f1bd9ca9d4f9e73b0a6c74bd5afef56f2bc931943a6e1c37e04e38" +dependencies = [ + "indexmap 2.2.6", + "serde", + "serde_spanned", + "toml_datetime", + "winnow", +] + [[package]] name = "tonic" version = "0.8.3" @@ -3260,6 +3628,16 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -3592,6 +3970,15 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +[[package]] +name = "winnow" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b5e5f6c299a3c7890b876a2a587f3115162487e704907d9b6cd29473052ba1" +dependencies = [ + "memchr", +] + [[package]] name = "winreg" version = "0.50.0" @@ -3611,6 +3998,26 @@ dependencies = [ "linked-hash-map", ] +[[package]] +name = "zerocopy" +version = "0.7.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae87e3fcd617500e5d106f0380cf7b77f3c6092aae37191433159dda23cfb087" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.58", +] + [[package]] name = "zstd-sys" version = "2.0.10+zstd.1.5.6" diff --git a/Cargo.toml b/Cargo.toml index e5ebe6d3..186e7bc7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,12 @@ members = [ "src/runtime", "src/schema", "src/server", + + # layers "layers/etcd", + + # third + "third/raft-engine", ] resolver = "2" diff --git a/Makefile b/Makefile index f5d0d0a4..c6d18c12 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ etcd: .PHONY: lint ## lint : Lint codespace lint: - $(V)cargo clippy --workspace --tests --all-features -- -D warnings + $(V)cargo clippy --workspace --exclude raft-engine --tests --all-features -- -D warnings .PHONY: fmt ## fmt 
: Format all code diff --git a/src/bin/src/bench/report.rs b/src/bin/src/bench/report.rs index 19599b09..841bfdba 100644 --- a/src/bin/src/bench/report.rs +++ b/src/bin/src/bench/report.rs @@ -135,20 +135,20 @@ fn histogram_diff( .get_sample_count() .saturating_sub(earlier.get_histogram().get_sample_count()), ); - h.mut_bucket().extend( - current - .get_histogram() - .get_bucket() - .iter() - .zip(earlier.get_histogram().get_bucket().iter()) - .map(|(b1, b2)| { - let cumulative_count = b1.get_cumulative_count() - b2.get_cumulative_count(); - let mut b = prometheus::proto::Bucket::default(); - b.set_cumulative_count(cumulative_count); - b.set_upper_bound(b1.get_upper_bound()); - b - }), - ); + + current + .get_histogram() + .get_bucket() + .iter() + .zip(earlier.get_histogram().get_bucket().iter()) + .map(|(b1, b2)| { + let cumulative_count = b1.get_cumulative_count() - b2.get_cumulative_count(); + let mut b = prometheus::proto::Bucket::default(); + b.set_cumulative_count(cumulative_count); + b.set_upper_bound(b1.get_upper_bound()); + b + }) + .for_each(|v| h.mut_bucket().push(v)); m.set_histogram(h); m diff --git a/src/server/Cargo.toml b/src/server/Cargo.toml index 97677092..126b09d5 100644 --- a/src/server/Cargo.toml +++ b/src/server/Cargo.toml @@ -57,10 +57,9 @@ default-features = false branch = "prost-0.11" [dependencies.raft-engine] -git = "https://github.com/w41ter/raft-engine.git" +path = "../../third/raft-engine" default-features = false features = ["prost"] -branch = "prost-0.11" [dependencies.rocksdb] git = "https://github.com/w41ter/rust-rocksdb.git" diff --git a/third/raft-engine/CHANGELOG.md b/third/raft-engine/CHANGELOG.md new file mode 100644 index 00000000..96195407 --- /dev/null +++ b/third/raft-engine/CHANGELOG.md @@ -0,0 +1,91 @@ +# Raft Engine Change Log + +## [Unreleased] + +### New Features + +* Add a new Prometheus metric `raft_engine_write_compression_ratio` to track compression ratio of write #358 + +## [0.4.2] - 2024-04-16 + +### Behavior Changes + +* Periodically flush unsynced bytes when rewriting to avoid I/O jitters if flushing too many bytes impede the foreground writes. (#347) +* Errors will be returned if rewriting fails, instread of `panic` directly. (#343) + +## [0.4.1] - 2023-09-14 + +### Behavior Changes + +* When log recycling is enabled, Raft Engine will now retain 50% more log files to reduce the chance of running out. +* Reduce the scope of keys reserved for internal use. + +## [0.4.0] - 2023-09-01 + +### Behavior Changes + +* `LogBatch::put` returns a `Result<()>` instead of `()`. It errs when the key is reserved for internal use. +* Possible to specify a permission in `FileSystem::open`. +* Prometheus counter `raft_engine_log_file_count` no longer includes retired log files that are stashed for recycling. Those files are now tracked by a new counter `raft_engine_recycled_file_count`. + +### Bug Fixes + +* Fix data loss caused by aborted rewrite operation. Downgrading to an earlier version without the fix may produce phantom Raft Groups or keys, i.e. never written but appear in queries. +* Fix a potential bug that an un-persisted log batch is mistakenly recovered and causes checksum mismatch error when being read later. + +### New Features + +* Support preparing prefilled logs to enable log recycling when start-up. The amount of logs to prepare is controlled by `Config::prefill_limit`. +* Add a new configuration `spill-dir` to allow automatic placement of logs into an auxiliary directory when `dir` is full. 
+* Add a new method `Engine::fork` to duplicate an `Engine` to a new place, with a few disk file copies. +* Support configuring lz4 acceleration factor with `compression-level`. + +## [0.3.0] - 2022-09-14 + +### Bug Fixes + +* Unconditionally tolerate `fallocate` failures as a fix to its portability issue. Errors other than `EOPNOTSUPP` will still emit a warning. +* Avoid leaving fractured write after failure by reseeking the file writer. Panic if the reseek fails as well. +* Fix a parallel recovery panic bug. +* Fix panic when an empty batch is written to engine and then reused. + +### New Features + +* Add `PerfContext` which records detailed time breakdown of the write process to thread-local storage. +* Support recycling obsolete log files to reduce the cost of `fallocate`-ing new ones. + +### Public API Changes + +* Add `is_empty` to `Engine` API. +* Add metadata deletion capability to `FileSystem` trait. Users can implement `exists_metadata` and `delete_metadata` to clean up obsolete metadata from older versions of Raft Engine. +* Add `Engine::scan_messages` and `Engine::scan_raw_messages` for iterating over written key-values. +* Add `Engine::get` for getting raw value. +* Move `sync` from `env::WriteExt` to `env::Handle`. +* Deprecate `bytes_per_sync`. +* Add `env::Handle::sync_range` for sync in asynchornous. + +### Behavior Changes + +* Change format version to 2 from 1 by default. +* Enable log recycling by default. + +## [0.2.0] - 2022-05-25 + +### Bug Fixes + +* Fix a false negative case of `LogBatch::is_empty()` #212 +* Fix fsync ordering when rotating log file #219 + +### New Features + +* Support limiting the memory usage of Raft Engine under new feature `swap` #211 +* Add a new Prometheus counter `raft_engine_memory_usage` to track memory usage #207 + +### Improvements + +* Reduce memory usage by 25% #206 + +### Public API Changes + +* Introduce a new error type `Full` #206 +* `LogBatch::merge` returns a `Result<()>` instead of `()` #206 diff --git a/third/raft-engine/Cargo.toml b/third/raft-engine/Cargo.toml new file mode 100644 index 00000000..7fbba1ee --- /dev/null +++ b/third/raft-engine/Cargo.toml @@ -0,0 +1,82 @@ +[package] +name = "raft-engine" +version = "0.4.2" +authors = ["The TiKV Project Developers"] +edition = "2018" +rust-version = "1.75.0" +description = "A persistent storage engine for Multi-Raft logs" +readme = "README.md" +repository = "https://github.com/tikv/raft-engine" +license = "Apache-2.0" + +[package.metadata.docs.rs] +features = ["internals"] + +[[test]] +name = "failpoints" +path = "tests/failpoints/mod.rs" +required-features = ["internals", "failpoints"] + +[[bench]] +name = "benches" +path = "tests/benches/mod.rs" +harness = false +required-features = ["failpoints"] + +[dependencies] +byteorder = "1.2" +crc32fast = "1.2" +crossbeam = "0.8" +fail = "0.5" +fs2 = "0.4" +hashbrown = "0.14" +hex = "0.4" +if_chain = "1.0" +lazy_static = "1.3" +libc = "0.2" +log = { version = "0.4", features = [ + "max_level_trace", + "release_max_level_debug", +] } +lz4-sys = "1.9" +memmap2 = { version = "0.9", optional = true } +nix = "0.26" +num-derive = "0.4" +num-traits = "0.2" +parking_lot = "0.12" +prometheus = { workspace = true } +prometheus-static-metric.workspace = true +rayon = "1.5" +rhai = { version = "1.7", features = ["sync"], optional = true } +scopeguard = "1.1" +serde = { version = "1.0", features = ["derive"] } +serde_repr = "0.1" +strum = { version = "0.26.2", features = ["derive"] } +thiserror = "1.0" +prost = { version = "0.11", optional = true 
} + +[dev-dependencies] +criterion = "0.4" +ctor = "0.2" +env_logger = "0.10" +rand = "0.8" +rand_distr = "0.4" +tempfile = "3.6" +toml = "0.8" + +[features] +default = ["internals", "scripting"] +internals = [] +nightly = ["prometheus/nightly"] +failpoints = ["fail/failpoints"] +scripting = ["rhai"] +swap = ["nightly", "memmap2"] +std_fs = [] + +nightly_group = ["nightly", "swap"] + +[dependencies.raft] +git = "https://github.com/w41ter/raft-rs.git" +features = ["prost-codec", "default-logger"] +default-features = false +branch = "prost-0.11" diff --git a/third/raft-engine/ctl/Cargo.toml b/third/raft-engine/ctl/Cargo.toml new file mode 100644 index 00000000..2071705c --- /dev/null +++ b/third/raft-engine/ctl/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "raft-engine-ctl" +version = "0.4.2" +authors = ["The TiKV Project Developers"] +edition = "2018" +rust-version = "1.75.0" +description = "A control tool for Raft Engine" +repository = "https://github.com/tikv/raft-engine" +license = "Apache-2.0" + +[dependencies] +clap = { version = "3.1", features = ["derive", "cargo"] } +env_logger = "0.10" +raft-engine = { path = "..", version = "0.4.1", features = [ + "scripting", + "internals", +] } diff --git a/third/raft-engine/ctl/src/lib.rs b/third/raft-engine/ctl/src/lib.rs new file mode 100644 index 00000000..3f48c55a --- /dev/null +++ b/third/raft-engine/ctl/src/lib.rs @@ -0,0 +1,164 @@ +// Copyright (c) 2017-present, PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +//! # Raft Engine Control + +use std::path::Path; +use std::sync::Arc; + +use clap::{crate_authors, crate_version, Parser}; +use raft_engine::env::{DefaultFileSystem, FileSystem}; +use raft_engine::internals::LogQueue; +use raft_engine::{Engine, Error, Result as EngineResult}; + +#[derive(Debug, clap::Parser)] +#[clap( + name = "ctl", + author = crate_authors!(), + version = crate_version!(), + dont_collapse_args_in_usage = true, +)] +pub struct ControlOpt { + // sub command type + #[clap(subcommand)] + cmd: Option, +} + +#[derive(Debug, Parser)] +enum Cmd { + /// Dump log entries in data file(s). + Dump { + /// Path of Raft Engine directory or specific log file. + #[clap(short, long)] + path: String, + + #[clap(short, long, use_value_delimiter = true)] + raft_groups: Vec, + }, + + /// Check data files for logical errors. + Check { + /// Path of Raft Engine directory. + #[clap(short, long)] + path: String, + }, + + /// Run Rhai script to repair data files. + Repair { + /// Path of Raft Engine directory. + #[clap(short, long)] + path: String, + + #[clap( + short, + long, + possible_values = &["append", "rewrite", "all"] + )] + queue: String, + + /// Path of Rhai script file. + #[clap(short, long)] + script: String, + }, + + /// Try running `purge_expired_files` on existing data directory. + TryPurge { + /// Path of Raft Engine directory. 
+ #[clap(short, long)] + path: String, + }, +} + +fn convert_queue(queue: &str) -> Option { + match queue { + "append" => Some(LogQueue::Append), + "rewrite" => Some(LogQueue::Rewrite), + "all" => None, + _ => unreachable!(), + } +} + +impl ControlOpt { + pub fn validate_and_execute(self) -> EngineResult<()> { + self.validate_and_execute_with_file_system(Arc::new(DefaultFileSystem)) + } + + pub fn validate_and_execute_with_file_system( + mut self, + fs: Arc, + ) -> EngineResult<()> { + if self.cmd.is_none() { + return Err(Error::InvalidArgument("subcommand is needed".to_owned())); + } + + match self.cmd.take().unwrap() { + Cmd::Dump { path, raft_groups } => { + let it = Engine::dump_with_file_system(Path::new(&path), fs)?; + for item in it { + if let Ok(v) = item { + if raft_groups.is_empty() || raft_groups.contains(&v.raft_group_id) { + println!("{v:?}") + } + } else { + // output error message + println!("{item:?}") + } + } + } + Cmd::Repair { + path, + queue, + script, + } => { + let script = std::fs::read_to_string(script)?; + Engine::unsafe_repair_with_file_system( + Path::new(&path), + convert_queue(&queue), + script, + fs, + )?; + } + Cmd::Check { path } => { + let r = Engine::consistency_check_with_file_system(Path::new(&path), fs)?; + if r.is_empty() { + println!("All data is Ok") + } else { + println!("Corrupted info are as follows:\nraft_group_id, last_intact_index\n"); + r.iter().for_each(|(x, y)| println!("{x:?}, {y:?}")) + } + } + Cmd::TryPurge { path } => { + let e = Engine::open_with_file_system( + raft_engine::Config { + dir: path, + ..Default::default() + }, + fs, + )?; + println!( + "purge_expired_files() returns {:?}", + e.purge_expired_files()? + ); + } + } + Ok(()) + } +} + +pub fn run_command(mut args: Vec, fs: Arc) { + args.insert(0, "ctl".to_owned()); + let opts = ControlOpt::parse_from(args); + if let Err(e) = opts.validate_and_execute_with_file_system(fs) { + println!("{e:?}"); + } +} diff --git a/third/raft-engine/ctl/src/main.rs b/third/raft-engine/ctl/src/main.rs new file mode 100644 index 00000000..13643e66 --- /dev/null +++ b/third/raft-engine/ctl/src/main.rs @@ -0,0 +1,13 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use clap::Parser; +use raft_engine_ctl::ControlOpt; + +fn main() { + env_logger::init(); + let opts: ControlOpt = ControlOpt::parse(); + + if let Err(e) = opts.validate_and_execute() { + println!("{e:?}"); + } +} diff --git a/third/raft-engine/rustfmt.toml b/third/raft-engine/rustfmt.toml new file mode 100644 index 00000000..b2715b26 --- /dev/null +++ b/third/raft-engine/rustfmt.toml @@ -0,0 +1 @@ +wrap_comments = true diff --git a/third/raft-engine/src/codec.rs b/third/raft-engine/src/codec.rs new file mode 100644 index 00000000..0cfb4968 --- /dev/null +++ b/third/raft-engine/src/codec.rs @@ -0,0 +1,678 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
+ +#![allow(dead_code)] + +use std::io::{self, ErrorKind, Write}; +use std::mem; + +use byteorder::{BigEndian, ByteOrder, LittleEndian, WriteBytesExt}; +use thiserror::Error; + +pub type BytesSlice<'a> = &'a [u8]; + +#[derive(Debug, Error)] +pub enum Error { + #[error("{0}")] + Io(#[from] io::Error), + #[error("bad format key(length)")] + KeyLength, + #[error("bad format key(padding)")] + KeyPadding, + #[error("key not found")] + KeyNotFound, +} + +impl Error { + pub fn unexpected_eof() -> Error { + Error::Io(io::Error::new(ErrorKind::UnexpectedEof, "eof")) + } +} + +pub type Result = std::result::Result; + +const SIGN_MARK: u64 = 0x8000000000000000; +const MAX_VAR_I64_LEN: usize = 10; +const MAX_VAR_U64_LEN: usize = 10; +const U64_SIZE: usize = 8; +const I64_SIZE: usize = 8; +const F64_SIZE: usize = 8; + +fn order_encode_i64(v: i64) -> u64 { + v as u64 ^ SIGN_MARK +} + +fn order_decode_i64(u: u64) -> i64 { + (u ^ SIGN_MARK) as i64 +} + +fn order_encode_f64(v: f64) -> u64 { + let u = v.to_bits(); + if v.is_sign_positive() { + u | SIGN_MARK + } else { + !u + } +} + +fn order_decode_f64(u: u64) -> f64 { + let u = if u & SIGN_MARK > 0 { + u & (!SIGN_MARK) + } else { + !u + }; + f64::from_bits(u) +} + +pub trait NumberEncoder: Write { + /// Writes the encoded value to buf. + /// It guarantees that the encoded value is in ascending order for + /// comparison. + fn encode_i64(&mut self, v: i64) -> Result<()> { + let u = order_encode_i64(v); + self.encode_u64(u) + } + + /// Writes the encoded value to buf. + /// It guarantees that the encoded value is in descending order for + /// comparison. + fn encode_i64_desc(&mut self, v: i64) -> Result<()> { + let u = order_encode_i64(v); + self.encode_u64_desc(u) + } + + /// Writes the encoded value to slice buf. + /// It guarantees that the encoded value is in ascending order for + /// comparison. + fn encode_u64(&mut self, v: u64) -> Result<()> { + self.write_u64::(v).map_err(From::from) + } + + /// Writes the encoded value to slice buf. + /// It guarantees that the encoded value is in descending order for + /// comparison. + fn encode_u64_desc(&mut self, v: u64) -> Result<()> { + self.write_u64::(!v).map_err(From::from) + } + + /// Writes the encoded value to slice buf in big endian order. + fn encode_u32(&mut self, v: u32) -> Result<()> { + self.write_u32::(v).map_err(From::from) + } + + /// Writes the encoded value to slice buf in big endian order. + fn encode_u16(&mut self, v: u16) -> Result<()> { + self.write_u16::(v).map_err(From::from) + } + + /// Writes the encoded value to slice buf. + /// Note that the encoded result is not memcomparable. + fn encode_var_i64(&mut self, v: i64) -> Result<()> { + let mut vx = (v as u64) << 1; + if v < 0 { + vx = !vx; + } + self.encode_var_u64(vx) + } + + /// Writes the encoded value to slice buf. + /// Note that the encoded result is not memcomparable. + fn encode_var_u64(&mut self, mut v: u64) -> Result<()> { + while v >= 0x80 { + self.write_u8(v as u8 | 0x80)?; + v >>= 7; + } + self.write_u8(v as u8).map_err(From::from) + } + + /// Writes the encoded value to slice buf. + /// It guarantees that the encoded value is in ascending order for + /// comparison. + fn encode_f64(&mut self, f: f64) -> Result<()> { + let u = order_encode_f64(f); + self.encode_u64(u) + } + + /// Writes the encoded value to slice buf. + /// It guarantees that the encoded value is in descending order for + /// comparison. 
+ fn encode_f64_desc(&mut self, f: f64) -> Result<()> { + let u = order_encode_f64(f); + self.encode_u64_desc(u) + } + + /// Writes `u16` numbers in little endian order. + fn encode_u16_le(&mut self, v: u16) -> Result<()> { + self.write_u16::(v).map_err(From::from) + } + + /// Writes `f32` numbers in little endian order. + fn encode_f32_le(&mut self, v: f32) -> Result<()> { + self.write_f32::(v).map_err(From::from) + } + + /// Writes `u32` numbers in little endian order. + fn encode_u32_le(&mut self, v: u32) -> Result<()> { + self.write_u32::(v).map_err(From::from) + } + + /// Writes `i32` numbers in little endian order. + fn encode_i32_le(&mut self, v: i32) -> Result<()> { + self.write_i32::(v).map_err(From::from) + } + + /// Writes `f64` numbers in little endian order. + fn encode_f64_le(&mut self, v: f64) -> Result<()> { + self.write_f64::(v).map_err(From::from) + } + + /// Writes `i64` numbers in little endian order. + fn encode_i64_le(&mut self, v: i64) -> Result<()> { + self.write_i64::(v).map_err(From::from) + } + + /// Writes `u64` numbers in little endian order. + fn encode_u64_le(&mut self, v: u64) -> Result<()> { + self.write_u64::(v).map_err(From::from) + } +} + +impl NumberEncoder for T {} + +#[inline] +fn read_num_bytes(size: usize, data: &mut &[u8], f: F) -> Result +where + F: Fn(&[u8]) -> T, +{ + if data.len() >= size { + let buf = &data[..size]; + *data = &data[size..]; + return Ok(f(buf)); + } + Err(Error::unexpected_eof()) +} + +/// Decodes value encoded by `encode_i64` before. +#[inline] +pub fn decode_i64(data: &mut BytesSlice<'_>) -> Result { + decode_u64(data).map(order_decode_i64) +} + +/// Decodes value encoded by `encode_i64_desc` before. +#[inline] +pub fn decode_i64_desc(data: &mut BytesSlice<'_>) -> Result { + decode_u64_desc(data).map(order_decode_i64) +} + +/// Decodes value encoded by `encode_u64` before. +#[inline] +pub fn decode_u64(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, BigEndian::read_u64) +} + +/// Decodes value encoded by `encode_u32` before. +#[inline] +pub fn decode_u32(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, BigEndian::read_u32) +} + +/// Decodes value encoded by `encode_u16` before. +#[inline] +pub fn decode_u16(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, BigEndian::read_u16) +} + +/// Decodes value encoded by `encode_u64_desc` before. +#[inline] +pub fn decode_u64_desc(data: &mut BytesSlice<'_>) -> Result { + let v = decode_u64(data)?; + Ok(!v) +} + +/// Decodes value encoded by `encode_var_i64` before. +#[inline] +pub fn decode_var_i64(data: &mut BytesSlice<'_>) -> Result { + let v = decode_var_u64(data)?; + let vx = v >> 1; + if v & 1 == 0 { + Ok(vx as i64) + } else { + Ok(!vx as i64) + } +} + +/// Decodes value encoded by `encode_var_u64` before. +#[inline] +pub fn decode_var_u64(data: &mut BytesSlice<'_>) -> Result { + if !data.is_empty() { + // process with value < 127 independently at first + // since it matches most of the cases. + if data[0] < 0x80 { + let res = u64::from(data[0]) & 0x7f; + *data = unsafe { data.get_unchecked(1..) }; + return Ok(res); + } + + // process with data's len >=10 or data ends with var u64 + if data.len() >= 10 || *data.last().unwrap() < 0x80 { + let mut res = 0; + for i in 0..9 { + let b = unsafe { *data.get_unchecked(i) }; + res |= (u64::from(b) & 0x7f) << (i * 7); + if b < 0x80 { + *data = unsafe { data.get_unchecked(i + 1..) 
}; + return Ok(res); + } + } + let b = unsafe { *data.get_unchecked(9) }; + if b <= 1 { + res |= ((u64::from(b)) & 0x7f) << (9 * 7); + *data = unsafe { data.get_unchecked(10..) }; + return Ok(res); + } + return Err(Error::Io(io::Error::new( + ErrorKind::InvalidData, + "overflow", + ))); + } + } + + // process data's len < 10 && data not end with var u64. + let mut res = 0; + for i in 0..data.len() { + let b = data[i]; + res |= (u64::from(b) & 0x7f) << (i * 7); + if b < 0x80 { + *data = unsafe { data.get_unchecked(i + 1..) }; + return Ok(res); + } + } + Err(Error::unexpected_eof()) +} + +/// Decodes value encoded by `encode_f64` before. +#[inline] +pub fn decode_f64(data: &mut BytesSlice<'_>) -> Result { + decode_u64(data).map(order_decode_f64) +} + +/// Decodes value encoded by `encode_f64_desc` before. +#[inline] +pub fn decode_f64_desc(data: &mut BytesSlice<'_>) -> Result { + decode_u64_desc(data).map(order_decode_f64) +} + +/// Decodes value encoded by `encode_u16_le` before. +#[inline] +pub fn decode_u16_le(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, LittleEndian::read_u16) +} + +/// Decodes value encoded by `encode_u32_le` before. +#[inline] +pub fn decode_u32_le(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, LittleEndian::read_u32) +} + +/// Decodes value encoded by `encode_i32_le` before. +#[inline] +pub fn decode_i32_le(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, LittleEndian::read_i32) +} + +/// Decodes value encoded by `encode_f64_le` before. +#[inline] +pub fn decode_f64_le(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, LittleEndian::read_f64) +} + +/// Decodes value encoded by `encode_f32_le` before. +#[inline] +pub fn decode_f32_le(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, LittleEndian::read_f32) +} + +/// Decodes value encoded by `encode_i64_le` before. +#[inline] +pub fn decode_i64_le(data: &mut BytesSlice<'_>) -> Result { + let v = decode_u64_le(data)?; + Ok(v as i64) +} + +/// Decodes value encoded by `encode_u64_le` before. 
+#[inline] +pub fn decode_u64_le(data: &mut BytesSlice<'_>) -> Result { + read_num_bytes(mem::size_of::(), data, LittleEndian::read_u64) +} + +#[inline] +pub fn read_u8(data: &mut BytesSlice<'_>) -> Result { + if !data.is_empty() { + let v = data[0]; + *data = &data[1..]; + Ok(v) + } else { + Err(Error::unexpected_eof()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::ErrorKind; + use std::{f32, f64, i16, i32, i64, u16, u32, u64}; + + const U16_TESTS: &[u16] = &[ + i16::MIN as u16, + i16::MAX as u16, + u16::MIN, + u16::MAX, + 0, + 1, + 2, + 10, + 20, + 63, + 64, + 65, + 127, + 128, + 129, + 255, + 256, + 257, + 1024, + ]; + + const F32_TESTS: &[f32] = &[ + f32::EPSILON, + f32::MIN, + f32::MIN_POSITIVE, + f32::MAX, + // NAN is unequal to itself + // f32::NAN, + f32::INFINITY, + f32::NEG_INFINITY, + ]; + + const U32_TESTS: &[u32] = &[ + i32::MIN as u32, + i32::MAX as u32, + u32::MIN, + u32::MAX, + 0, + 1, + 2, + 10, + 20, + 63, + 64, + 65, + 127, + 128, + 129, + 255, + 256, + 257, + 1024, + ]; + + const U64_TESTS: &[u64] = &[ + i64::MIN as u64, + i64::MAX as u64, + u64::MIN, + u64::MAX, + 0, + 1, + 2, + 10, + 20, + 63, + 64, + 65, + 127, + 128, + 129, + 255, + 256, + 257, + 1024, + ]; + const I64_TESTS: &[i64] = &[ + i64::MIN, + i64::MAX, + u64::MIN as i64, + u64::MAX as i64, + -1, + 0, + 1, + 2, + 10, + 20, + 63, + 64, + 65, + 127, + 128, + 129, + 255, + 256, + 257, + 1024, + -1023, + ]; + + const F64_TESTS: &[f64] = &[ + -1.0, + 0.0, + 1.0, + f64::MAX, + f64::MIN, + f32::MAX as f64, + f32::MIN as f64, + f64::MIN_POSITIVE, + f32::MIN_POSITIVE as f64, + f64::INFINITY, + f64::NEG_INFINITY, + ]; + + const I32_TESTS: &[i32] = &[ + i32::MIN, + i32::MAX, + 0, + 1, + 2, + 10, + 20, + 63, + 64, + 65, + 127, + 128, + 129, + 255, + 256, + 257, + -1024, + ]; + + // use macro to generate order tests for number codecs. + macro_rules! test_order { + ($arr:expr, $sorted:expr, $enc:ident, $dec:ident) => { + let mut encoded: Vec<_> = $arr + .iter() + .map(|e| { + let mut buf = vec![]; + buf.$enc(*e).unwrap(); + buf + }) + .collect(); + encoded.sort(); + let decoded: Vec<_> = encoded + .iter() + .map(|b| $dec(&mut b.as_slice()).unwrap()) + .collect(); + assert_eq!(decoded, $sorted); + }; + } + + // use macro to generate serialization tests for number codecs. + macro_rules! test_serialize { + ($tag:ident, $enc:ident, $dec:ident, $cases:expr) => { + #[test] + fn $tag() { + for &v in $cases { + let mut buf = vec![]; + buf.$enc(v).unwrap(); + assert!(buf.len() <= MAX_VAR_I64_LEN); + assert_eq!(v, $dec(&mut buf.as_slice()).unwrap()); + } + } + }; + } + + // use macro to generate serialization and order tests for number codecs. + macro_rules! 
test_codec { + ($enc:ident, $dec:ident, $compare:expr, $cases:expr) => { + #[allow(unused_imports)] + #[allow(clippy::float_cmp)] + mod $enc { + use super::{F64_TESTS, I64_TESTS, U16_TESTS, U32_TESTS, U64_TESTS}; + use crate::codec::*; + + test_serialize!(serialize, $enc, $dec, $cases); + + #[test] + fn test_order() { + let mut ordered_case = $cases.to_vec(); + #[allow(clippy::unnecessary_sort_by)] + ordered_case.sort_by($compare); + test_order!($cases, ordered_case, $enc, $dec); + } + } + }; + } + + test_codec!(encode_i64, decode_i64, |a, b| a.cmp(b), I64_TESTS); + test_codec!(encode_u32, decode_u32, |a, b| a.cmp(b), U32_TESTS); + test_codec!(encode_u16, decode_u16, |a, b| a.cmp(b), U16_TESTS); + test_codec!(encode_i64_desc, decode_i64_desc, |a, b| b.cmp(a), I64_TESTS); + test_codec!(encode_u64, decode_u64, |a, b| a.cmp(b), U64_TESTS); + test_codec!(encode_u64_desc, decode_u64_desc, |a, b| b.cmp(a), U64_TESTS); + test_codec!( + encode_f64, + decode_f64, + |a, b| a.partial_cmp(b).unwrap(), + F64_TESTS + ); + test_codec!( + encode_f64_desc, + decode_f64_desc, + |a, b| b.partial_cmp(a).unwrap(), + F64_TESTS + ); + + test_serialize!( + var_i64_little_endian_codec, + encode_i64_le, + decode_i64_le, + I64_TESTS + ); + test_serialize!( + var_u64_little_endian_codec, + encode_u64_le, + decode_u64_le, + U64_TESTS + ); + test_serialize!( + var_i32_little_endian_codec, + encode_i32_le, + decode_i32_le, + I32_TESTS + ); + test_serialize!(var_u16_codec, encode_u16_le, decode_u16_le, U16_TESTS); + test_serialize!( + var_f16_codec_check_eq, // work around float_cmp lint + encode_f32_le, + decode_f32_le, + F32_TESTS + ); + test_serialize!(var_u32_codec, encode_u32_le, decode_u32_le, U32_TESTS); + test_serialize!(var_i64_codec, encode_var_i64, decode_var_i64, I64_TESTS); + + #[test] + #[allow(clippy::float_cmp)] + fn test_var_f64_le() { + for &v in F64_TESTS { + let mut buf = vec![]; + buf.encode_f64_le(v).unwrap(); + let value = decode_f64_le(&mut buf.as_slice()).unwrap(); + assert_eq!(v, value); + } + } + + #[test] + fn test_var_u64_codec() { + for &v in U64_TESTS { + let mut buf = vec![]; + buf.encode_var_u64(v).unwrap(); + assert!(buf.len() <= MAX_VAR_I64_LEN); + let decoded = decode_var_u64(&mut buf.as_slice()).unwrap(); + assert_eq!(v, decoded); + } + } + + // test if a `Result` is expected io error. + macro_rules! check_error { + ($e:expr, $k:expr) => { + match $e { + Err(Error::Io(e)) => assert_eq!(e.kind(), $k), + o => panic!("expect {:?}, got {:?}", $k, o), + } + }; + } + + // generate bound check test for number codecs. + macro_rules! 
test_eof { + ($tag:ident, $enc:ident, $dec:ident, $case:expr) => { + #[test] + fn $tag() { + let mut buf = vec![0; 7]; + check_error!(buf.as_mut_slice().$enc($case), ErrorKind::WriteZero); + check_error!($dec(&mut buf.as_slice()), ErrorKind::UnexpectedEof); + } + }; + } + + test_eof!(i64_eof, encode_i64, decode_i64, 1); + test_eof!(u64_eof, encode_u64, decode_u64, 1); + test_eof!(f64_eof, encode_f64, decode_f64, 1.0); + test_eof!(i64_desc_eof, encode_i64_desc, decode_i64_desc, 1); + test_eof!(u64_desc_eof, encode_u64_desc, decode_u64_desc, 1); + test_eof!(f64_desc_eof, encode_f64_desc, decode_f64_desc, 1.0); + + #[test] + fn test_var_eof() { + let mut buf = vec![0x80; 9]; + buf.push(0x2); + check_error!(decode_var_u64(&mut buf.as_slice()), ErrorKind::InvalidData); + check_error!(decode_var_i64(&mut buf.as_slice()), ErrorKind::InvalidData); + + buf = vec![0x80; 3]; + check_error!( + decode_var_u64(&mut buf.as_slice()), + ErrorKind::UnexpectedEof + ); + check_error!(decode_var_u64(&mut [].as_slice()), ErrorKind::UnexpectedEof); + + buf.push(0); + assert_eq!(0, decode_var_u64(&mut buf.as_slice()).unwrap()); + } + + #[test] + fn test_u8_eof() { + let buf = vec![7]; + let mut slice = buf.as_slice(); + assert_eq!(7, read_u8(&mut slice).unwrap()); + check_error!(read_u8(&mut slice), ErrorKind::UnexpectedEof); + } +} diff --git a/third/raft-engine/src/config.rs b/third/raft-engine/src/config.rs new file mode 100644 index 00000000..4283737f --- /dev/null +++ b/third/raft-engine/src/config.rs @@ -0,0 +1,370 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use log::{info, warn}; +use serde::{Deserialize, Serialize}; + +use crate::pipe_log::Version; +use crate::{util::ReadableSize, Result}; + +const MIN_RECOVERY_READ_BLOCK_SIZE: usize = 512; +const MIN_RECOVERY_THREADS: usize = 1; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum RecoveryMode { + AbsoluteConsistency, + // For backward compatibility. + #[serde( + alias = "tolerate-corrupted-tail-records", + rename(serialize = "tolerate-corrupted-tail-records") + )] + TolerateTailCorruption, + TolerateAnyCorruption, +} + +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +#[serde(default)] +#[serde(rename_all = "kebab-case")] +pub struct Config { + /// Main directory to store log files. Will create on startup if not exists. + /// + /// Default: "" + pub dir: String, + + /// Auxiliary directory to store log files. Will create on startup if + /// set but not exists. + /// + /// Newly logs will be put into this dir when the main `dir` is full + /// and no spare space for new logs. + /// + /// Default: None + pub spill_dir: Option, + + /// How to deal with file corruption during recovery. + /// + /// Default: "tolerate-tail-corruption". + pub recovery_mode: RecoveryMode, + /// Minimum I/O size for reading log files during recovery. + /// + /// Default: "16KB". Minimum: "512B". + pub recovery_read_block_size: ReadableSize, + /// The number of threads used to scan and recovery log files. + /// + /// Default: 4. Minimum: 1. + pub recovery_threads: usize, + + /// Compress a log batch if its size exceeds this value. Setting it to zero + /// disables compression. + /// + /// Default: "8KB" + pub batch_compression_threshold: ReadableSize, + /// Acceleration factor for LZ4 compression. It can be fine tuned, with each + /// successive value providing roughly +~3% to speed. The value will be + /// capped within [1, 65537] by LZ4. + /// + /// Default: 1. 
+ pub compression_level: Option, + /// Deprecated. + /// Incrementally sync log files after specified bytes have been written. + /// Setting it to zero disables incremental sync. + /// + /// Default: "4MB" + pub bytes_per_sync: Option, + + /// Version of the log file. + /// + /// Default: 2 + pub format_version: Version, + + /// Target file size for rotating log files. + /// + /// Default: "128MB" + pub target_file_size: ReadableSize, + + /// Purge append log queue if its size exceeds this value. + /// + /// Default: "10GB" + pub purge_threshold: ReadableSize, + /// Purge rewrite log queue if its size exceeds this value. + /// + /// Default: MAX(`purge_threshold` / 10, `target_file_size`) + pub purge_rewrite_threshold: Option, + /// Purge rewrite log queue if its garbage ratio exceeds this value. + /// + /// Default: "0.6" + pub purge_rewrite_garbage_ratio: f64, + + /// Maximum memory bytes allowed for the in-memory index. + /// Effective under the `swap` feature only. + /// + /// Default: None + pub memory_limit: Option, + + /// Whether to recycle stale log files. + /// If `true`, logically purged log files will be reserved for recycling. + /// Only available for `format_version` 2 and above. + /// + /// Default: true + pub enable_log_recycle: bool, + + /// Whether to prepare log files for recycling when start. + /// If `true`, batch empty log files will be prepared for recycling when + /// starting engine. + /// Only available for `enable-log-reycle` is true. + /// + /// Default: false + pub prefill_for_recycle: bool, + + /// Maximum capacity for preparing log files for recycling when start. + /// If `None`, its size is equal to `purge-threshold`*1.5. + /// Only available for `prefill-for-recycle` is true. + /// + /// Default: None + pub prefill_limit: Option, +} + +impl Default for Config { + fn default() -> Config { + #[allow(unused_mut)] + let mut cfg = Config { + dir: "".to_owned(), + spill_dir: None, + recovery_mode: RecoveryMode::TolerateTailCorruption, + recovery_read_block_size: ReadableSize::kb(16), + recovery_threads: 4, + batch_compression_threshold: ReadableSize::kb(8), + compression_level: None, + bytes_per_sync: None, + format_version: Version::V2, + target_file_size: ReadableSize::mb(128), + purge_threshold: ReadableSize::gb(10), + purge_rewrite_threshold: None, + purge_rewrite_garbage_ratio: 0.6, + memory_limit: None, + enable_log_recycle: true, + prefill_for_recycle: false, + prefill_limit: None, + }; + // Test-specific configurations. 
+ #[cfg(test)] + { + cfg.memory_limit = Some(ReadableSize(0)); + } + cfg + } +} + +impl Config { + pub fn sanitize(&mut self) -> Result<()> { + if self.purge_threshold.0 < self.target_file_size.0 { + return Err(box_err!("purge-threshold < target-file-size")); + } + if self.purge_rewrite_threshold.is_none() { + self.purge_rewrite_threshold = Some(ReadableSize(std::cmp::max( + self.purge_threshold.0 / 10, + self.target_file_size.0, + ))); + } + if self.bytes_per_sync.is_some() { + warn!("bytes-per-sync has been deprecated."); + } + let min_recovery_read_block_size = ReadableSize(MIN_RECOVERY_READ_BLOCK_SIZE as u64); + if self.recovery_read_block_size < min_recovery_read_block_size { + warn!( + "recovery-read-block-size ({}) is too small, setting it to {min_recovery_read_block_size}", + self.recovery_read_block_size + ); + self.recovery_read_block_size = min_recovery_read_block_size; + } + if self.recovery_threads < MIN_RECOVERY_THREADS { + warn!( + "recovery-threads ({}) is too small, setting it to {MIN_RECOVERY_THREADS}", + self.recovery_threads + ); + self.recovery_threads = MIN_RECOVERY_THREADS; + } + if self.enable_log_recycle && !self.format_version.has_log_signing() { + return Err(box_err!( + "format version {} doesn't support log recycle, use 2 or above", + self.format_version + )); + } + if !self.enable_log_recycle && self.prefill_for_recycle { + return Err(box_err!( + "prefill is not allowed when log recycle is disabled" + )); + } + if !self.prefill_for_recycle && self.prefill_limit.is_some() { + warn!("prefill-limit will be ignored when prefill is disabled"); + self.prefill_limit = None; + } + if self.prefill_for_recycle && self.prefill_limit.is_none() { + info!("prefill-limit will be calibrated to purge-threshold"); + self.prefill_limit = Some(self.purge_threshold); + } + #[cfg(not(feature = "swap"))] + if self.memory_limit.is_some() { + warn!("memory-limit will be ignored because swap feature is disabled"); + } + Ok(()) + } + + /// Returns the capacity for recycling log files. + pub(crate) fn recycle_capacity(&self) -> usize { + // Attention please, log files with Version::V1 could not be recycled, it might + // cause LogBatchs in a mess in the recycled file, where the reader might get + // an obsolete entries (unexpected) from the recycled file. + if !self.format_version.has_log_signing() { + return 0; + } + if self.enable_log_recycle && self.purge_threshold.0 >= self.target_file_size.0 { + // (1) At most u32::MAX so that the file number can be capped into an u32 + // without colliding. (2) Increase the threshold by 50% to add some more file + // as an additional buffer to avoid jitters. + std::cmp::min( + (self.purge_threshold.0 / self.target_file_size.0) as usize * 3 / 2, + u32::MAX as usize, + ) + } else { + 0 + } + } + + /// Returns the capacity for preparing log files for recycling when start. + pub(crate) fn prefill_capacity(&self) -> usize { + // Attention please, log files with Version::V1 could not be recycled, so it's + // useless for prefill. + if !self.enable_log_recycle || !self.format_version.has_log_signing() { + return 0; + } + let prefill_limit = self.prefill_limit.unwrap_or(ReadableSize(0)).0; + if self.prefill_for_recycle && prefill_limit >= self.target_file_size.0 { + // Keep same with the maximum setting of `recycle_capacity`. 
+ std::cmp::min( + (prefill_limit / self.target_file_size.0) as usize * 3 / 2, + u32::MAX as usize, + ) + } else { + 0 + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_serde() { + let value = Config::default(); + let dump = toml::to_string_pretty(&value).unwrap(); + let load = toml::from_str(&dump).unwrap(); + assert_eq!(value, load); + assert!(load.spill_dir.is_none()); + } + + #[test] + fn test_custom() { + let custom = r#" + dir = "custom_dir" + spill-dir = "custom_spill_dir" + recovery-mode = "tolerate-tail-corruption" + bytes-per-sync = "2KB" + target-file-size = "1MB" + purge-threshold = "3MB" + format-version = 1 + enable-log-recycle = false + prefill-for-recycle = false + "#; + let mut load: Config = toml::from_str(custom).unwrap(); + assert_eq!(load.dir, "custom_dir"); + assert_eq!(load.spill_dir, Some("custom_spill_dir".to_owned())); + assert_eq!(load.recovery_mode, RecoveryMode::TolerateTailCorruption); + assert_eq!(load.bytes_per_sync, Some(ReadableSize::kb(2))); + assert_eq!(load.target_file_size, ReadableSize::mb(1)); + assert_eq!(load.purge_threshold, ReadableSize::mb(3)); + assert_eq!(load.format_version, Version::V1); + assert_eq!(load.enable_log_recycle, false); + assert_eq!(load.prefill_for_recycle, false); + load.sanitize().unwrap(); + } + + #[test] + fn test_invalid() { + let hard_error = r#" + target-file-size = "5MB" + purge-threshold = "3MB" + "#; + let mut hard_load: Config = toml::from_str(hard_error).unwrap(); + assert!(hard_load.sanitize().is_err()); + + let soft_error = r#" + recovery-read-block-size = 1 + recovery-threads = 0 + target-file-size = "5000MB" + format-version = 2 + enable-log-recycle = true + prefill-for-recycle = true + "#; + let soft_load: Config = toml::from_str(soft_error).unwrap(); + assert!(soft_load.recovery_read_block_size.0 < MIN_RECOVERY_READ_BLOCK_SIZE as u64); + assert!(soft_load.recovery_threads < MIN_RECOVERY_THREADS); + let mut soft_sanitized = soft_load; + soft_sanitized.sanitize().unwrap(); + assert!(soft_sanitized.recovery_read_block_size.0 >= MIN_RECOVERY_READ_BLOCK_SIZE as u64); + assert!(soft_sanitized.recovery_threads >= MIN_RECOVERY_THREADS); + assert_eq!( + soft_sanitized.purge_rewrite_threshold.unwrap(), + soft_sanitized.target_file_size + ); + + let recycle_error = r#" + enable-log-recycle = true + format-version = 1 + "#; + let mut cfg_load: Config = toml::from_str(recycle_error).unwrap(); + assert!(cfg_load.sanitize().is_err()); + + let prefill_error = r#" + enable-log-recycle = false + prefill-for-recycle = true + format-version = 2 + "#; + let mut cfg_load: Config = toml::from_str(prefill_error).unwrap(); + assert!(cfg_load.sanitize().is_err()); + } + + #[test] + fn test_backward_compactibility() { + // Upgrade from older version. + let old = r#" + recovery-mode = "tolerate-corrupted-tail-records" + "#; + let mut load: Config = toml::from_str(old).unwrap(); + load.sanitize().unwrap(); + // Downgrade to older version. 
+ assert!(toml::to_string(&load) + .unwrap() + .contains("tolerate-corrupted-tail-records")); + } + + #[test] + fn test_prefill_for_recycle() { + let default_prefill_v1 = r#" + enable-log-recycle = true + prefill-for-recycle = true + "#; + let mut cfg_load: Config = toml::from_str(default_prefill_v1).unwrap(); + assert!(cfg_load.sanitize().is_ok()); + assert_eq!(cfg_load.prefill_limit.unwrap(), cfg_load.purge_threshold); + + let default_prefill_v2 = r#" + enable-log-recycle = true + prefill-for-recycle = false + prefill-limit = "20GB" + "#; + let mut cfg_load: Config = toml::from_str(default_prefill_v2).unwrap(); + assert!(cfg_load.sanitize().is_ok()); + assert!(cfg_load.prefill_limit.is_none()); + } +} diff --git a/third/raft-engine/src/consistency.rs b/third/raft-engine/src/consistency.rs new file mode 100644 index 00000000..c400a664 --- /dev/null +++ b/third/raft-engine/src/consistency.rs @@ -0,0 +1,71 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use hashbrown::HashMap; + +use crate::file_pipe_log::ReplayMachine; +use crate::log_batch::{LogItemBatch, LogItemContent}; +use crate::pipe_log::{FileId, LogQueue}; +use crate::Result; + +/// A `ConsistencyChecker` scans for log entry holes in a log queue. It will +/// return a list of corrupted raft groups along with their last valid log +/// index. +#[derive(Default)] +pub struct ConsistencyChecker { + // Mappings from raft group id to (first-index, last-index). + raft_groups: HashMap, + // Mappings from raft group id to last valid index. + corrupted: HashMap, +} + +impl ConsistencyChecker { + pub fn finish(self) -> HashMap { + self.corrupted + } +} + +impl ReplayMachine for ConsistencyChecker { + fn replay(&mut self, item_batch: LogItemBatch, _file_id: FileId) -> Result<()> { + for item in item_batch.iter() { + if let LogItemContent::EntryIndexes(ents) = &item.content { + if !ents.0.is_empty() { + let incoming_first_index = ents.0.first().unwrap().index; + let incoming_last_index = ents.0.last().unwrap().index; + let index_range = self + .raft_groups + .entry(item.raft_group_id) + .or_insert((incoming_first_index, incoming_last_index)); + if index_range.1 + 1 < incoming_first_index { + self.corrupted + .entry(item.raft_group_id) + .or_insert(index_range.1); + } + index_range.1 = incoming_last_index; + } + } + } + Ok(()) + } + + fn merge(&mut self, mut rhs: Self, _queue: LogQueue) -> Result<()> { + let mut corrupted_between_rhs: HashMap = HashMap::default(); + for (id, (first, last)) in rhs.raft_groups.drain() { + self.raft_groups + .entry(id) + .and_modify(|(_, l)| { + if *l + 1 < first { + corrupted_between_rhs.insert(id, *l); + } + *l = last; + }) + .or_insert((first, last)); + } + for (id, last_index) in corrupted_between_rhs.drain() { + self.corrupted.entry(id).or_insert(last_index); + } + for (id, last_index) in rhs.corrupted.drain() { + self.corrupted.entry(id).or_insert(last_index); + } + Ok(()) + } +} diff --git a/third/raft-engine/src/engine.rs b/third/raft-engine/src/engine.rs new file mode 100644 index 00000000..e53d3a3b --- /dev/null +++ b/third/raft-engine/src/engine.rs @@ -0,0 +1,2805 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
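+
+//! A minimal usage sketch of the engine defined in this module, assuming the
+//! crate re-exports `Config`, `Engine` and `LogBatch` at its root (names and
+//! paths are illustrative; the example is not compiled):
+//!
+//! ```ignore
+//! use raft_engine::{Config, Engine, LogBatch};
+//!
+//! let cfg = Config {
+//!     dir: "/tmp/raft-engine-demo".to_owned(),
+//!     ..Default::default()
+//! };
+//! let engine = Engine::open(cfg).unwrap();
+//!
+//! let mut batch = LogBatch::default();
+//! batch.put(1, b"k".to_vec(), b"v".to_vec()).unwrap();
+//! // `write` returns the number of written bytes; `true` requests fdatasync.
+//! assert!(engine.write(&mut batch, true).unwrap() > 0);
+//! assert_eq!(engine.get(1, b"k"), Some(b"v".to_vec()));
+//! ```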
+ +use std::cell::{Cell, RefCell}; +use std::marker::PhantomData; +use std::path::Path; +use std::sync::{mpsc, Arc, Mutex}; +use std::thread::{Builder as ThreadBuilder, JoinHandle}; +use std::time::{Duration, Instant}; + +use log::{error, info}; +use prost::Message; + +use crate::config::{Config, RecoveryMode}; +use crate::consistency::ConsistencyChecker; +use crate::env::{DefaultFileSystem, FileSystem}; +use crate::event_listener::EventListener; +use crate::file_pipe_log::debug::LogItemReader; +use crate::file_pipe_log::{DefaultMachineFactory, FilePipeLog, FilePipeLogBuilder}; +use crate::log_batch::{Command, LogBatch, MessageExt}; +use crate::memtable::{EntryIndex, MemTableRecoverContextFactory, MemTables}; +use crate::metrics::*; +use crate::pipe_log::{FileBlockHandle, LogQueue, PipeLog}; +use crate::purge::{PurgeHook, PurgeManager}; +use crate::write_barrier::{WriteBarrier, Writer}; +use crate::{perf_context, Error, GlobalStats, Result}; + +const METRICS_FLUSH_INTERVAL: Duration = Duration::from_secs(30); +/// Max times for `write`. +const MAX_WRITE_ATTEMPT: u64 = 2; + +pub struct Engine> +where + F: FileSystem, + P: PipeLog, +{ + cfg: Arc, + listeners: Vec>, + + instance_id: u64, + #[allow(dead_code)] + stats: Arc, + memtables: MemTables, + pipe_log: Arc

<P>,
+    purge_manager: PurgeManager<P>
, + + write_barrier: WriteBarrier>, + + tx: Mutex>, + metrics_flusher: Option>, + + _phantom: PhantomData, +} + +impl Engine> { + pub fn open(cfg: Config) -> Result>> { + Self::open_with_listeners(cfg, vec![]) + } + + pub fn open_with_listeners( + cfg: Config, + listeners: Vec>, + ) -> Result>> { + Self::open_with(cfg, Arc::new(DefaultFileSystem), listeners) + } +} + +impl Engine> +where + F: FileSystem, +{ + pub fn open_with_file_system( + cfg: Config, + file_system: Arc, + ) -> Result>> { + Self::open_with(cfg, file_system, vec![]) + } + + pub fn open_with( + mut cfg: Config, + file_system: Arc, + mut listeners: Vec>, + ) -> Result>> { + use std::hash::{DefaultHasher, Hash, Hasher}; + + cfg.sanitize()?; + listeners.push(Arc::new(PurgeHook::default()) as Arc); + + let mut hasher = DefaultHasher::new(); + cfg.dir.hash(&mut hasher); + let instance_id = hasher.finish(); + + let start = Instant::now(); + let mut builder = FilePipeLogBuilder::new(cfg.clone(), file_system, listeners.clone()); + builder.scan()?; + let factory = MemTableRecoverContextFactory::new(&cfg); + let (append, rewrite) = builder.recover(&factory)?; + let pipe_log = Arc::new(builder.finish()?); + rewrite.merge_append_context(append); + let (memtables, stats) = rewrite.finish(); + info!( + "Recovering raft logs for instance {} takes {:?}", + instance_id, + start.elapsed() + ); + + let cfg = Arc::new(cfg); + let purge_manager = PurgeManager::new( + cfg.clone(), + instance_id, + memtables.clone(), + pipe_log.clone(), + stats.clone(), + listeners.clone(), + ); + + let (tx, rx) = mpsc::channel(); + let stats_clone = stats.clone(); + let memtables_clone = memtables.clone(); + let metrics_flusher = ThreadBuilder::new() + .name("re-metrics".into()) + .spawn(move || loop { + stats_clone.flush_metrics(); + memtables_clone.flush_metrics(); + if rx.recv_timeout(METRICS_FLUSH_INTERVAL).is_ok() { + break; + } + })?; + + Ok(Self { + cfg, + listeners, + instance_id, + stats, + memtables, + pipe_log, + purge_manager, + write_barrier: Default::default(), + tx: Mutex::new(tx), + metrics_flusher: Some(metrics_flusher), + _phantom: PhantomData, + }) + } +} + +impl Engine +where + F: FileSystem, + P: PipeLog, +{ + /// Writes the content of `log_batch` into the engine and returns written + /// bytes. If `sync` is true, the write will be followed by a call to + /// `fdatasync` on the log file. + pub fn write(&self, log_batch: &mut LogBatch, mut sync: bool) -> Result { + if log_batch.is_empty() { + return Ok(0); + } + let start = Instant::now(); + let (len, compression_ratio) = log_batch.finish_populate( + self.cfg.batch_compression_threshold.0 as usize, + self.cfg.compression_level, + )?; + debug_assert!(len > 0); + + let mut attempt_count = 0_u64; + let block_handle = loop { + // Max retry count is limited to `WRITE_MAX_RETRY_TIMES`, that is, 2. + // If the first `append` retry because of NOSPC error, the next `append` + // should success, unless there exists several abnormal cases in the IO device. + // In that case, `Engine::write` must return `Err`. + attempt_count += 1; + let mut writer = Writer::new(log_batch, sync); + // Snapshot and clear the current perf context temporarily, so the write group + // leader will collect the perf context diff later. 
+ let mut perf_context = take_perf_context(); + let before_enter = Instant::now(); + if let Some(mut group) = self.write_barrier.enter(&mut writer) { + let now = Instant::now(); + let _t = StopWatch::new_with(&*ENGINE_WRITE_LEADER_DURATION_HISTOGRAM, now); + for writer in group.iter_mut() { + writer.entered_time = Some(now); + sync |= writer.sync; + let log_batch = writer.mut_payload(); + let res = self.pipe_log.append(LogQueue::Append, log_batch); + writer.set_output(res); + } + perf_context!(log_write_duration).observe_since(now); + if sync { + // As per trait protocol, sync error should be retriable. But we panic anyway to + // save the trouble of propagating it to other group members. + self.pipe_log.sync(LogQueue::Append).expect("pipe::sync()"); + } + // Pass the perf context diff to all the writers. + let diff = get_perf_context(); + for writer in group.iter_mut() { + writer.perf_context_diff = diff.clone(); + } + } + let entered_time = writer.entered_time.unwrap(); + perf_context.write_wait_duration += + entered_time.saturating_duration_since(before_enter); + debug_assert_eq!(writer.perf_context_diff.write_wait_duration, Duration::ZERO); + perf_context += &writer.perf_context_diff; + set_perf_context(perf_context); + // Retry if `writer.finish()` returns a special 'Error::TryAgain', remarking + // that there still exists free space for this `LogBatch`. + match writer.finish() { + Ok(handle) => { + ENGINE_WRITE_PREPROCESS_DURATION_HISTOGRAM + .observe(entered_time.saturating_duration_since(start).as_secs_f64()); + break handle; + } + Err(Error::TryAgain(e)) => { + if attempt_count >= MAX_WRITE_ATTEMPT { + // A special err, we will retry this LogBatch `append` by appending + // this writer to the next write group, and the current write leader + // will not hang on this write and will return timely. + return Err(Error::TryAgain(format!( + "Failed to write logbatch, exceed MAX_WRITE_ATTEMPT: ({MAX_WRITE_ATTEMPT}), err: {e}", + ))); + } + info!("got err: {e}, try to write this LogBatch again"); + } + Err(e) => { + return Err(e); + } + } + }; + let mut now = Instant::now(); + log_batch.finish_write(block_handle); + self.memtables.apply_append_writes(log_batch.drain()); + for listener in &self.listeners { + listener.post_apply_memtables(block_handle.id); + } + let end = Instant::now(); + let apply_duration = end.saturating_duration_since(now); + ENGINE_WRITE_APPLY_DURATION_HISTOGRAM.observe(apply_duration.as_secs_f64()); + perf_context!(apply_duration).observe(apply_duration); + now = end; + ENGINE_WRITE_DURATION_HISTOGRAM.observe(now.saturating_duration_since(start).as_secs_f64()); + ENGINE_WRITE_SIZE_HISTOGRAM.observe(len as f64); + ENGINE_WRITE_COMPRESSION_RATIO_HISTOGRAM.observe(compression_ratio); + Ok(len) + } + + /// Synchronizes the Raft engine. 
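+    ///
+    /// A usage sketch (illustrative, not compiled): batched writes may skip the
+    /// per-write sync and be made durable with a single trailing `sync` call.
+    ///
+    /// ```ignore
+    /// engine.write(&mut batch_1, false)?;
+    /// engine.write(&mut batch_2, false)?;
+    /// // Make the earlier appends durable in one go.
+    /// engine.sync()?;
+    /// ```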
+ pub fn sync(&self) -> Result<()> { + self.write(&mut LogBatch::default(), true)?; + Ok(()) + } + + pub fn get_message( + &self, + region_id: u64, + key: &[u8], + ) -> Result> { + let _t = StopWatch::new(&*ENGINE_READ_MESSAGE_DURATION_HISTOGRAM); + if let Some(memtable) = self.memtables.get(region_id) { + if let Some(value) = memtable.read().get(key) { + return Ok(Some(Message::decode(&*value)?)); + } + } + Ok(None) + } + + pub fn get(&self, region_id: u64, key: &[u8]) -> Option> { + let _t = StopWatch::new(&*ENGINE_READ_MESSAGE_DURATION_HISTOGRAM); + if let Some(memtable) = self.memtables.get(region_id) { + return memtable.read().get(key); + } + None + } + + /// Iterates over [start_key, end_key) range of Raft Group key-values and + /// yields messages of the required type. Unparsable items are skipped. + pub fn scan_messages( + &self, + region_id: u64, + start_key: Option<&[u8]>, + end_key: Option<&[u8]>, + reverse: bool, + mut callback: C, + ) -> Result<()> + where + S: Message + Default, + C: FnMut(&[u8], S) -> bool, + { + self.scan_raw_messages(region_id, start_key, end_key, reverse, move |k, raw_v| { + if let Ok(v) = Message::decode(raw_v) { + callback(k, v) + } else { + true + } + }) + } + + /// Iterates over [start_key, end_key) range of Raft Group key-values and + /// yields all key value pairs as bytes. + pub fn scan_raw_messages( + &self, + region_id: u64, + start_key: Option<&[u8]>, + end_key: Option<&[u8]>, + reverse: bool, + callback: C, + ) -> Result<()> + where + C: FnMut(&[u8], &[u8]) -> bool, + { + let _t = StopWatch::new(&*ENGINE_READ_MESSAGE_DURATION_HISTOGRAM); + if let Some(memtable) = self.memtables.get(region_id) { + memtable + .read() + .scan(start_key, end_key, reverse, callback)?; + } + Ok(()) + } + + pub fn get_entry( + &self, + region_id: u64, + log_idx: u64, + ) -> Result> { + let _t = StopWatch::new(&*ENGINE_READ_ENTRY_DURATION_HISTOGRAM); + if let Some(memtable) = self.memtables.get(region_id) { + if let Some(idx) = memtable.read().get_entry(log_idx) { + ENGINE_READ_ENTRY_COUNT_HISTOGRAM.observe(1.0); + return Ok(Some(read_entry_from_file::( + self.instance_id, + self.pipe_log.as_ref(), + &idx, + )?)); + } + } + Ok(None) + } + + /// Purges expired logs files and returns a set of Raft group ids that need + /// to be compacted. + pub fn purge_expired_files(&self) -> Result> { + self.purge_manager.purge_expired_files() + } + + /// Returns count of fetched entries. + pub fn fetch_entries_to( + &self, + region_id: u64, + begin: u64, + end: u64, + max_size: Option, + vec: &mut Vec, + ) -> Result { + let _t = StopWatch::new(&*ENGINE_READ_ENTRY_DURATION_HISTOGRAM); + if let Some(memtable) = self.memtables.get(region_id) { + let mut ents_idx: Vec = Vec::with_capacity((end - begin) as usize); + memtable + .read() + .fetch_entries_to(begin, end, max_size, &mut ents_idx)?; + for i in ents_idx.iter() { + vec.push(read_entry_from_file::( + self.instance_id, + self.pipe_log.as_ref(), + i, + )?); + } + ENGINE_READ_ENTRY_COUNT_HISTOGRAM.observe(ents_idx.len() as f64); + return Ok(ents_idx.len()); + } + Ok(0) + } + + pub fn first_index(&self, region_id: u64) -> Option { + if let Some(memtable) = self.memtables.get(region_id) { + return memtable.read().first_index(); + } + None + } + + pub fn last_index(&self, region_id: u64) -> Option { + if let Some(memtable) = self.memtables.get(region_id) { + return memtable.read().last_index(); + } + None + } + + /// Deletes log entries before `index` in the specified Raft group. Returns + /// the number of deleted entries. 
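+    ///
+    /// A sketch (illustrative, not compiled), assuming region `1` currently
+    /// holds entries `[1, 10]`:
+    ///
+    /// ```ignore
+    /// let deleted = engine.compact_to(1, 8);
+    /// assert_eq!(deleted, 7);
+    /// assert_eq!(engine.first_index(1), Some(8));
+    /// ```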
+ pub fn compact_to(&self, region_id: u64, index: u64) -> u64 { + let first_index = match self.first_index(region_id) { + Some(index) => index, + None => return 0, + }; + + let mut log_batch = LogBatch::default(); + log_batch.add_command(region_id, Command::Compact { index }); + if let Err(e) = self.write(&mut log_batch, false) { + error!("Failed to write Compact command: {e}"); + } + + self.first_index(region_id).unwrap_or(index) - first_index + } + + pub fn raft_groups(&self) -> Vec { + self.memtables.fold(vec![], |mut v, m| { + v.push(m.region_id()); + v + }) + } + + /// Returns `true` if the engine contains no Raft Group. Empty Raft Group + /// that isn't cleaned is counted as well. + pub fn is_empty(&self) -> bool { + self.memtables.is_empty() + } + + /// Returns the sequence number range of active log files in the specific + /// log queue. + /// For testing only. + pub fn file_span(&self, queue: LogQueue) -> (u64, u64) { + self.pipe_log.file_span(queue) + } + + pub fn get_used_size(&self) -> usize { + self.pipe_log.total_size(LogQueue::Append) + self.pipe_log.total_size(LogQueue::Rewrite) + } + + pub fn path(&self) -> &str { + self.cfg.dir.as_str() + } + + #[cfg(feature = "internals")] + pub fn purge_manager(&self) -> &PurgeManager
<P>
{ + &self.purge_manager + } +} + +impl Drop for Engine +where + F: FileSystem, + P: PipeLog, +{ + fn drop(&mut self) { + self.tx.lock().unwrap().send(()).unwrap(); + if let Some(t) = self.metrics_flusher.take() { + t.join().unwrap(); + } + } +} + +impl Engine> { + pub fn consistency_check(path: &Path) -> Result> { + Self::consistency_check_with_file_system(path, Arc::new(DefaultFileSystem)) + } + + #[cfg(feature = "scripting")] + pub fn unsafe_repair(path: &Path, queue: Option, script: String) -> Result<()> { + Self::unsafe_repair_with_file_system(path, queue, script, Arc::new(DefaultFileSystem)) + } + + pub fn dump(path: &Path) -> Result> { + Self::dump_with_file_system(path, Arc::new(DefaultFileSystem)) + } +} + +impl Engine> +where + F: FileSystem, +{ + /// Returns a list of corrupted Raft groups, including their ids and last + /// valid log index. Head or tail corruption cannot be detected. + pub fn consistency_check_with_file_system( + path: &Path, + file_system: Arc, + ) -> Result> { + if !path.exists() { + return Err(Error::InvalidArgument(format!( + "raft-engine directory '{}' does not exist.", + path.to_str().unwrap() + ))); + } + + let cfg = Config { + dir: path.to_str().unwrap().to_owned(), + recovery_mode: RecoveryMode::TolerateAnyCorruption, + ..Default::default() + }; + let mut builder = FilePipeLogBuilder::new(cfg, file_system, Vec::new()); + builder.scan()?; + let (append, rewrite) = + builder.recover(&DefaultMachineFactory::::default())?; + let mut map = rewrite.finish(); + for (id, index) in append.finish() { + map.entry(id).or_insert(index); + } + let mut list: Vec<(u64, u64)> = map.into_iter().collect(); + list.sort_unstable(); + Ok(list) + } + + #[cfg(feature = "scripting")] + pub fn unsafe_repair_with_file_system( + path: &Path, + queue: Option, + script: String, + file_system: Arc, + ) -> Result<()> { + use crate::file_pipe_log::{RecoveryConfig, ReplayMachine}; + + if !path.exists() { + return Err(Error::InvalidArgument(format!( + "raft-engine directory '{}' does not exist.", + path.to_str().unwrap() + ))); + } + + let cfg = Config { + dir: path.to_str().unwrap().to_owned(), + recovery_mode: RecoveryMode::TolerateAnyCorruption, + ..Default::default() + }; + let recovery_mode = cfg.recovery_mode; + let read_block_size = cfg.recovery_read_block_size.0; + let mut builder = FilePipeLogBuilder::new(cfg, file_system.clone(), Vec::new()); + builder.scan()?; + let factory = crate::filter::RhaiFilterMachineFactory::from_script(script); + let mut machine = None; + if queue.is_none() || queue.unwrap() == LogQueue::Append { + machine = Some(builder.recover_queue( + file_system.clone(), + RecoveryConfig { + queue: LogQueue::Append, + mode: recovery_mode, + concurrency: 1, + read_block_size, + }, + &factory, + )?); + } + if queue.is_none() || queue.unwrap() == LogQueue::Rewrite { + let machine2 = builder.recover_queue( + file_system.clone(), + RecoveryConfig { + queue: LogQueue::Rewrite, + mode: recovery_mode, + concurrency: 1, + read_block_size, + }, + &factory, + )?; + if let Some(machine) = &mut machine { + machine.merge(machine2, LogQueue::Rewrite)?; + } + } + if let Some(machine) = machine { + machine.finish(file_system.as_ref(), path)?; + } + Ok(()) + } + + /// Dumps all operations. 
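+    ///
+    /// A sketch (illustrative, not compiled), assuming the yielded items
+    /// implement `Debug`; the path may point to a data directory or to a
+    /// single log file:
+    ///
+    /// ```ignore
+    /// let items = Engine::dump_with_file_system(
+    ///     Path::new("/path/to/raft-engine-dir"),
+    ///     Arc::new(DefaultFileSystem),
+    /// )?;
+    /// for item in items {
+    ///     println!("{:?}", item?);
+    /// }
+    /// ```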
+ pub fn dump_with_file_system(path: &Path, file_system: Arc) -> Result> { + if !path.exists() { + return Err(Error::InvalidArgument(format!( + "raft-engine directory or file '{}' does not exist.", + path.to_str().unwrap() + ))); + } + + if path.is_dir() { + LogItemReader::new_directory_reader(file_system, path) + } else { + LogItemReader::new_file_reader(file_system, path) + } + } +} + +struct BlockCache { + key: Cell<(u64, FileBlockHandle)>, + block: RefCell>, +} + +impl BlockCache { + fn new() -> Self { + BlockCache { + key: Cell::new((0, FileBlockHandle::dummy(LogQueue::Append))), + block: RefCell::new(Vec::new()), + } + } + + fn insert(&self, instance_id: u64, key: FileBlockHandle, block: Vec) { + self.key.set((instance_id, key)); + self.block.replace(block); + } +} + +thread_local! { + static BLOCK_CACHE: BlockCache = BlockCache::new(); +} + +pub(crate) fn read_entry_from_file( + instance_id: u64, + pipe_log: &P, + idx: &EntryIndex, +) -> Result +where + M: MessageExt, + P: PipeLog, +{ + BLOCK_CACHE.with(|cache| { + let entries = idx.entries.unwrap(); + if cache.key.get() != (instance_id, entries) { + cache.insert( + instance_id, + entries, + LogBatch::decode_entries_block( + &pipe_log.read_bytes(entries)?, + entries, + idx.compression_type, + )?, + ); + } + let e = Message::decode( + &cache.block.borrow() + [idx.entry_offset as usize..(idx.entry_offset + idx.entry_len) as usize], + )?; + assert_eq!(M::index(&e), idx.index); + Ok(e) + }) +} + +pub(crate) fn read_entry_bytes_from_file
<P>
( + instance_id: u64, + pipe_log: &P, + idx: &EntryIndex, +) -> Result> +where + P: PipeLog, +{ + BLOCK_CACHE.with(|cache| { + let entries = idx.entries.unwrap(); + if cache.key.get() != (instance_id, entries) { + cache.insert( + instance_id, + entries, + LogBatch::decode_entries_block( + &pipe_log.read_bytes(entries)?, + entries, + idx.compression_type, + )?, + ); + } + Ok(cache.block.borrow() + [idx.entry_offset as usize..(idx.entry_offset + idx.entry_len) as usize] + .to_owned()) + }) +} + +#[cfg(test)] +pub(crate) mod tests { + use std::collections::{BTreeSet, HashSet}; + use std::fs::OpenOptions; + use std::path::PathBuf; + + use raft::eraftpb::Entry; + + use super::*; + use crate::env::{ObfuscatedFileSystem, Permission}; + use crate::file_pipe_log::{parse_reserved_file_name, FileNameExt}; + use crate::internals::FileId; + use crate::log_batch::AtomicGroupBuilder; + use crate::pipe_log::Version; + use crate::test_util::{generate_entries, PanicGuard}; + use crate::util::ReadableSize; + + #[derive(Clone, PartialEq, ::prost::Message)] + struct RaftLocalState { + #[prost(uint64, tag = "1")] + last_index: u64, + } + + pub(crate) type RaftLogEngine = Engine; + impl RaftLogEngine { + fn append(&self, rid: u64, start_index: u64, end_index: u64, data: Option<&[u8]>) { + let entries = generate_entries(start_index, end_index, data); + if !entries.is_empty() { + let mut batch = LogBatch::default(); + batch.add_entries::(rid, &entries).unwrap(); + batch + .put_message( + rid, + b"last_index".to_vec(), + &RaftLocalState { + last_index: entries[entries.len() - 1].index, + ..Default::default() + }, + ) + .unwrap(); + self.write(&mut batch, true).unwrap(); + } + } + + fn clean(&self, rid: u64) { + let mut log_batch = LogBatch::default(); + log_batch.add_command(rid, Command::Clean); + self.write(&mut log_batch, true).unwrap(); + } + + fn decode_last_index(&self, rid: u64) -> Option { + self.get_message::(rid, b"last_index") + .unwrap() + .map(|s| s.last_index) + } + + fn reopen(self) -> Self { + let cfg: Config = self.cfg.as_ref().clone(); + let file_system = self.pipe_log.file_system(); + let mut listeners = self.listeners.clone(); + listeners.pop(); + drop(self); + RaftLogEngine::open_with(cfg, file_system, listeners).unwrap() + } + + fn scan_entries( + &self, + rid: u64, + start: u64, + end: u64, + reader: FR, + ) { + let mut entries = Vec::new(); + self.fetch_entries_to::( + rid, + self.first_index(rid).unwrap(), + self.last_index(rid).unwrap() + 1, + None, + &mut entries, + ) + .unwrap(); + assert_eq!(entries.first().unwrap().index, start, "{rid}"); + assert_eq!(entries.last().unwrap().index + 1, end); + assert_eq!( + entries.last().unwrap().index, + self.decode_last_index(rid).unwrap() + ); + assert_eq!(entries.len(), (end - start) as usize); + for e in entries.iter() { + let entry_index = self + .memtables + .get(rid) + .unwrap() + .read() + .get_entry(e.index) + .unwrap(); + assert_eq!(&self.get_entry::(rid, e.index).unwrap().unwrap(), e); + reader(e.index, entry_index.entries.unwrap().id.queue, &e.data); + } + } + + fn file_count(&self, queue: Option) -> usize { + if let Some(queue) = queue { + let (a, b) = self.file_span(queue); + (b - a + 1) as usize + } else { + self.file_count(Some(LogQueue::Append)) + self.file_count(Some(LogQueue::Rewrite)) + } + } + } + + #[test] + fn test_empty_engine() { + let dir = tempfile::Builder::new() + .prefix("test_empty_engine") + .tempdir() + .unwrap(); + let mut sub_dir = PathBuf::from(dir.as_ref()); + sub_dir.push("raft-engine"); + let cfg = Config { + 
dir: sub_dir.to_str().unwrap().to_owned(), + ..Default::default() + }; + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + } + + #[test] + fn test_get_entry() { + let normal_batch_size = 10; + let compressed_batch_size = 5120; + for &entry_size in &[normal_batch_size, compressed_batch_size] { + let dir = tempfile::Builder::new() + .prefix("test_get_entry") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + + let engine = RaftLogEngine::open_with_file_system( + cfg.clone(), + Arc::new(ObfuscatedFileSystem::default()), + ) + .unwrap(); + assert_eq!(engine.path(), dir.path().to_str().unwrap()); + let data = vec![b'x'; entry_size]; + for i in 10..20 { + let rid = i; + let index = i; + engine.append(rid, index, index + 2, Some(&data)); + } + for i in 10..20 { + let rid = i; + let index = i; + engine.scan_entries(rid, index, index + 2, |_, q, d| { + assert_eq!(q, LogQueue::Append); + assert_eq!(d, &data); + }); + } + + // Recover the engine. + let engine = engine.reopen(); + for i in 10..20 { + let rid = i; + let index = i; + engine.scan_entries(rid, index, index + 2, |_, q, d| { + assert_eq!(q, LogQueue::Append); + assert_eq!(d, &data); + }); + } + } + } + + #[test] + fn test_clean_raft_group() { + fn run_steps(steps: &[Option<(u64, u64)>]) { + let rid = 1; + let data = vec![b'x'; 1024]; + + for rewrite_step in 1..=steps.len() { + for exit_purge in [None, Some(1), Some(2)] { + let _guard = PanicGuard::with_prompt(format!( + "case: [{steps:?}, {rewrite_step}, {exit_purge:?}]", + )); + let dir = tempfile::Builder::new() + .prefix("test_clean_raft_group") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + let engine = RaftLogEngine::open_with_file_system( + cfg.clone(), + Arc::new(ObfuscatedFileSystem::default()), + ) + .unwrap(); + + for (i, step) in steps.iter().enumerate() { + if let Some((start, end)) = *step { + engine.append(rid, start, end, Some(&data)); + } else { + engine.clean(rid); + } + if i + 1 == rewrite_step { + engine + .purge_manager + .must_rewrite_append_queue(None, exit_purge); + } + } + + let engine = engine.reopen(); + if let Some((start, end)) = *steps.last().unwrap() { + engine.scan_entries(rid, start, end, |_, _, d| { + assert_eq!(d, &data); + }); + } else { + assert!(engine.raft_groups().is_empty()); + } + + engine.purge_manager.must_rewrite_append_queue(None, None); + let engine = engine.reopen(); + if let Some((start, end)) = *steps.last().unwrap() { + engine.scan_entries(rid, start, end, |_, _, d| { + assert_eq!(d, &data); + }); + } else { + assert!(engine.raft_groups().is_empty()); + } + } + } + } + + run_steps(&[Some((1, 5)), None, Some((2, 6)), None, Some((3, 7)), None]); + run_steps(&[Some((1, 5)), None, Some((2, 6)), None, Some((3, 7))]); + run_steps(&[Some((1, 5)), None, Some((2, 6)), None]); + run_steps(&[Some((1, 5)), None, Some((2, 6))]); + run_steps(&[Some((1, 5)), None]); + } + + #[test] + fn test_key_value_scan() { + fn key(i: u64) -> Vec { + format!("k{i}").as_bytes().to_vec() + } + fn value(i: u64) -> Vec { + format!("v{i}").as_bytes().to_vec() + } + fn rich_value(i: u64) -> RaftLocalState { + RaftLocalState { + last_index: i, + ..Default::default() + } + } + + let dir = tempfile::Builder::new() + .prefix("test_key_value_scan") + .tempdir() + .unwrap(); + let cfg = Config { + dir: 
dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + let rid = 1; + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + + engine + .scan_messages::(rid, None, None, false, |_, _| { + panic!("unexpected message."); + }) + .unwrap(); + + let mut batch = LogBatch::default(); + let mut res = Vec::new(); + let mut rich_res = Vec::new(); + batch.put(rid, key(1), value(1)).unwrap(); + batch.put(rid, key(2), value(2)).unwrap(); + batch.put(rid, key(3), value(3)).unwrap(); + engine.write(&mut batch, false).unwrap(); + + engine + .scan_raw_messages(rid, None, None, false, |k, v| { + res.push((k.to_vec(), v.to_vec())); + true + }) + .unwrap(); + assert_eq!( + res, + vec![(key(1), value(1)), (key(2), value(2)), (key(3), value(3))] + ); + res.clear(); + engine + .scan_raw_messages(rid, None, None, true, |k, v| { + res.push((k.to_vec(), v.to_vec())); + true + }) + .unwrap(); + assert_eq!( + res, + vec![(key(3), value(3)), (key(2), value(2)), (key(1), value(1))] + ); + res.clear(); + engine + .scan_messages::(rid, None, None, false, |_, _| { + panic!("unexpected message.") + }) + .unwrap(); + + batch.put_message(rid, key(22), &rich_value(22)).unwrap(); + batch.put_message(rid, key(33), &rich_value(33)).unwrap(); + engine.write(&mut batch, false).unwrap(); + + engine + .scan_messages(rid, None, None, false, |k, v| { + rich_res.push((k.to_vec(), v)); + false + }) + .unwrap(); + assert_eq!(rich_res, vec![(key(22), rich_value(22))]); + rich_res.clear(); + engine + .scan_messages(rid, None, None, true, |k, v| { + rich_res.push((k.to_vec(), v)); + false + }) + .unwrap(); + assert_eq!(rich_res, vec![(key(33), rich_value(33))]); + rich_res.clear(); + } + + #[test] + fn test_delete_key_value() { + let dir = tempfile::Builder::new() + .prefix("test_delete_key_value") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + let rid = 1; + let key = b"key".to_vec(); + let (v1, v2) = (b"v1".to_vec(), b"v2".to_vec()); + let mut batch_1 = LogBatch::default(); + batch_1.put(rid, key.clone(), v1).unwrap(); + let mut batch_2 = LogBatch::default(); + batch_2.put(rid, key.clone(), v2.clone()).unwrap(); + let mut delete_batch = LogBatch::default(); + delete_batch.delete(rid, key.clone()); + + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + assert_eq!( + engine.get_message::(rid, &key).unwrap(), + None + ); + assert_eq!(engine.get(rid, &key), None); + + // put | delete + // ^ rewrite + engine.write(&mut batch_1.clone(), true).unwrap(); + assert!(engine.get_message::(rid, &key).is_err()); + engine.purge_manager.must_rewrite_append_queue(None, None); + engine.write(&mut delete_batch.clone(), true).unwrap(); + let engine = engine.reopen(); + assert_eq!(engine.get(rid, &key), None); + assert_eq!( + engine.get_message::(rid, &key).unwrap(), + None + ); + + // Incomplete purge. + engine.write(&mut batch_1.clone(), true).unwrap(); + engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + engine.write(&mut delete_batch.clone(), true).unwrap(); + let engine = engine.reopen(); + assert_eq!(engine.get(rid, &key), None); + + // TODO: Preserve kv tombstone during rewrite and activate this test case. 
+ // put | delete | + // ^ rewrite + // let engine = engine.reopen(); + // engine.write(&mut batch_1.clone(), true).unwrap(); + // engine.write(&mut delete_batch.clone(), true).unwrap(); + // engine.purge_manager.must_rewrite_append_queue(None, None); + // let engine = engine.reopen(); + // assert_eq!(engine.get(rid, &key), None); + + // put | delete | put + // ^ rewrite + let engine = engine.reopen(); + engine.write(&mut batch_1.clone(), true).unwrap(); + engine.purge_manager.must_rewrite_append_queue(None, None); + engine.write(&mut delete_batch.clone(), true).unwrap(); + engine.write(&mut batch_2.clone(), true).unwrap(); + let engine = engine.reopen(); + assert_eq!(engine.get(rid, &key).unwrap(), v2); + // Incomplete purge. + engine.write(&mut batch_1.clone(), true).unwrap(); + engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + engine.write(&mut delete_batch.clone(), true).unwrap(); + engine.write(&mut batch_2.clone(), true).unwrap(); + let engine = engine.reopen(); + assert_eq!(engine.get(rid, &key).unwrap(), v2); + + // put | delete | put + // ^ rewrite + let engine = engine.reopen(); + engine.write(&mut batch_1.clone(), true).unwrap(); + engine.write(&mut delete_batch.clone(), true).unwrap(); + engine.purge_manager.must_rewrite_append_queue(None, None); + engine.write(&mut batch_2.clone(), true).unwrap(); + let engine = engine.reopen(); + assert_eq!(engine.get(rid, &key).unwrap(), v2); + // Incomplete purge. + engine.write(&mut batch_1.clone(), true).unwrap(); + engine.write(&mut delete_batch.clone(), true).unwrap(); + engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + engine.write(&mut batch_2.clone(), true).unwrap(); + let engine = engine.reopen(); + assert_eq!(engine.get(rid, &key).unwrap(), v2); + + // put | delete | put | + // ^ rewrite + let engine = engine.reopen(); + engine.write(&mut batch_1.clone(), true).unwrap(); + engine.write(&mut delete_batch.clone(), true).unwrap(); + engine.write(&mut batch_2.clone(), true).unwrap(); + engine.purge_manager.must_rewrite_append_queue(None, None); + let engine = engine.reopen(); + assert_eq!(engine.get(rid, &key).unwrap(), v2); + // Incomplete purge. + let engine = engine.reopen(); + engine.write(&mut batch_1.clone(), true).unwrap(); + engine.write(&mut delete_batch.clone(), true).unwrap(); + engine.write(&mut batch_2.clone(), true).unwrap(); + engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + let engine = engine.reopen(); + assert_eq!(engine.get(rid, &key).unwrap(), v2); + } + + #[test] + fn test_compact_raft_group() { + let dir = tempfile::Builder::new() + .prefix("test_compact_raft_group") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + let data = vec![b'x'; 1024]; + + // rewrite:[1 ..10] + // append: [5..10] + let mut rid = 7; + engine.append(rid, 1, 10, Some(&data)); + // Files are not purged. 
+ engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + let mut compact_log = LogBatch::default(); + compact_log.add_command(rid, Command::Compact { index: 5 }); + engine.write(&mut compact_log, true).unwrap(); + let engine = engine.reopen(); + engine.scan_entries(rid, 5, 10, |_, q, d| { + assert_eq!(q, LogQueue::Append); + assert_eq!(d, &data); + }); + assert_eq!(engine.stats.live_entries(LogQueue::Append), 6); // 5 entries + 1 kv + + // rewrite: [20..25] + // append: [10 ..25] + rid += 1; + engine.append(rid, 5, 15, Some(&data)); + let mut compact_log = LogBatch::default(); + compact_log.add_command(rid, Command::Compact { index: 10 }); + engine.write(&mut compact_log, true).unwrap(); + engine.append(rid, 15, 25, Some(&data)); + // Files are not purged. + engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + // Simulate loss of buffered write. + let mut compact_log = LogBatch::default(); + compact_log.add_command(rid, Command::Compact { index: 20 }); + engine.memtables.apply_append_writes(compact_log.drain()); + engine.purge_manager.must_rewrite_rewrite_queue(); + let engine = engine.reopen(); + engine.scan_entries(rid, 10, 25, |_, q, d| { + assert_eq!(q, LogQueue::Append); + assert_eq!(d, &data); + }); + assert_eq!(engine.stats.live_entries(LogQueue::Append), 22); // 20 entries + 2 kv + engine.clean(rid - 1); + assert_eq!(engine.stats.live_entries(LogQueue::Append), 16); + // rewrite: [20..25][10..25] + // append: [10..25] + engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + let engine = engine.reopen(); + engine.scan_entries(rid, 10, 25, |_, q, d| { + assert_eq!(q, LogQueue::Append); + assert_eq!(d, &data); + }); + + // rewrite:[10..15][15 ..25] + // append: [20..25] + rid += 1; + engine.append(rid, 5, 15, Some(&data)); + let mut compact_log = LogBatch::default(); + compact_log.add_command(rid, Command::Compact { index: 10 }); + engine.write(&mut compact_log, true).unwrap(); + engine.purge_manager.must_rewrite_append_queue(None, None); + engine.append(rid, 15, 25, Some(&data)); + engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + let mut compact_log = LogBatch::default(); + compact_log.add_command(rid, Command::Compact { index: 20 }); + engine.write(&mut compact_log, true).unwrap(); + let engine = engine.reopen(); + engine.scan_entries(rid, 20, 25, |_, q, d| { + assert_eq!(q, LogQueue::Append); + assert_eq!(d, &data); + }); + + // rewrite:[1..5] [10..15] + // append: [10..15] + rid += 1; + engine.append(rid, 1, 5, Some(&data)); + engine.purge_manager.must_rewrite_append_queue(None, None); + engine.append(rid, 5, 15, Some(&data)); + let mut compact_log = LogBatch::default(); + compact_log.add_command(rid, Command::Compact { index: 10 }); + engine.write(&mut compact_log, true).unwrap(); + // Files are not purged. 
+ engine + .purge_manager + .must_rewrite_append_queue(None, Some(2)); + let engine = engine.reopen(); + engine.scan_entries(rid, 10, 15, |_, q, d| { + assert_eq!(q, LogQueue::Append); + assert_eq!(d, &data); + }); + } + + #[test] + fn test_purge_triggered_by_compact() { + let dir = tempfile::Builder::new() + .prefix("test_purge_triggered_by_compact") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(5), + purge_threshold: ReadableSize::kb(150), + ..Default::default() + }; + + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + let data = vec![b'x'; 1024]; + for index in 0..100 { + engine.append(1, index, index + 1, Some(&data)); + } + + // GC all log entries. Won't trigger purge because total size is not enough. + let count = engine.compact_to(1, 100); + assert_eq!(count, 100); + assert!(!engine + .purge_manager + .needs_rewrite_log_files(LogQueue::Append)); + + // Append more logs to make total size greater than `purge_threshold`. + for index in 100..250 { + engine.append(1, index, index + 1, Some(&data)); + } + + // GC first 101 log entries. + assert_eq!(engine.compact_to(1, 101), 1); + // Needs to purge because the total size is greater than `purge_threshold`. + assert!(engine + .purge_manager + .needs_rewrite_log_files(LogQueue::Append)); + + let old_min_file_seq = engine.file_span(LogQueue::Append).0; + let will_force_compact = engine.purge_expired_files().unwrap(); + let new_min_file_seq = engine.file_span(LogQueue::Append).0; + // Some entries are rewritten. + assert!(new_min_file_seq > old_min_file_seq); + // No regions need to be force compacted because the threshold is not reached. + assert!(will_force_compact.is_empty()); + // After purge, entries and raft state are still available. + assert!(engine.get_entry::(1, 101).unwrap().is_some()); + + assert_eq!(engine.compact_to(1, 102), 1); + // Needs to purge because the total size is greater than `purge_threshold`. + assert!(engine + .purge_manager + .needs_rewrite_log_files(LogQueue::Append)); + let will_force_compact = engine.purge_expired_files().unwrap(); + // The region needs to be force compacted because the threshold is reached. + assert!(!will_force_compact.is_empty()); + assert_eq!(will_force_compact[0], 1); + } + + #[test] + fn test_purge_trigger_force_rewrite() { + let dir = tempfile::Builder::new() + .prefix("test_purge_trigger_force_write") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(1), + purge_threshold: ReadableSize::kb(10), + ..Default::default() + }; + + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + let data = vec![b'x'; 1024]; + // write 50 small entries into region 1~3, it should trigger force compact. + for rid in 1..=3 { + for index in 0..50 { + engine.append(rid, index, index + 1, Some(&data[..10])); + } + } + // write some small entries to trigger purge. + for rid in 4..=50 { + engine.append(rid, 1, 2, Some(&data)); + } + + let check_purge = |pending_regions: Vec| { + let mut compact_regions = engine.purge_expired_files().unwrap(); + // sort key in order. + compact_regions.sort_unstable(); + assert_eq!(compact_regions, pending_regions); + }; + + for _ in 0..9 { + check_purge(vec![1, 2, 3]); + } + + // 10th, rewritten, but still needs to be compacted. 
+ check_purge(vec![1, 2, 3]); + for rid in 1..=3 { + let memtable = engine.memtables.get(rid).unwrap(); + assert_eq!(memtable.read().rewrite_count(), 50); + } + + // compact and write some new data to trigger compact again. + for rid in 2..=50 { + let last_idx = engine.last_index(rid).unwrap(); + engine.compact_to(rid, last_idx); + engine.append(rid, last_idx, last_idx + 1, Some(&data)); + } + // after write, region 1 can trigger compact again. + check_purge(vec![1]); + } + + #[test] + fn test_rewrite_and_recover() { + let dir = tempfile::Builder::new() + .prefix("test_rewrite_and_recover") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(5), + purge_threshold: ReadableSize::kb(80), + ..Default::default() + }; + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + let data = vec![b'x'; 1024]; + + // Put 100 entries into 10 regions. + for index in 1..=10 { + for rid in 1..=10 { + engine.append(rid, index, index + 1, Some(&data)); + } + } + engine.append(11, 1, 11, Some(&data)); + + // The engine needs purge, and all old entries should be rewritten. + assert!(engine + .purge_manager + .needs_rewrite_log_files(LogQueue::Append)); + assert!(engine.purge_expired_files().unwrap().is_empty()); + assert!(engine.file_span(LogQueue::Append).0 > 1); + + let rewrite_file_size = engine.pipe_log.total_size(LogQueue::Rewrite); + assert!(rewrite_file_size > 59); // The rewrite queue isn't empty. + + // All entries should be available. + for rid in 1..=10 { + engine.scan_entries(rid, 1, 11, |_, _, d| { + assert_eq!(d, &data); + }); + } + + engine.clean(11); + let cleaned_region_ids = engine.memtables.cleaned_region_ids(); + assert_eq!(cleaned_region_ids.len(), 1); + + let engine = engine.reopen(); + assert_eq!(engine.memtables.cleaned_region_ids(), cleaned_region_ids); + + for rid in 1..=10 { + engine.scan_entries(rid, 1, 11, |_, _, d| { + assert_eq!(d, &data); + }); + } + + // Rewrite again to check the rewrite queue is healthy. 
+ for index in 11..=20 { + for rid in 1..=10 { + engine.append(rid, index, index + 1, Some(&data)); + } + } + + assert!(engine + .purge_manager + .needs_rewrite_log_files(LogQueue::Append)); + assert!(engine.purge_expired_files().unwrap().is_empty()); + } + + #[test] + fn test_empty_protobuf_message() { + let dir = tempfile::Builder::new() + .prefix("test_empty_protobuf_message") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + + let mut log_batch = LogBatch::default(); + let empty_entry = Entry::default(); + assert_eq!(empty_entry.encoded_len(), 0); + log_batch + .add_entries::(0, &[empty_entry.clone()]) + .unwrap(); + engine.write(&mut log_batch, false).unwrap(); + let empty_state = RaftLocalState::default(); + assert_eq!(empty_state.encoded_len(), 0); + log_batch + .put_message(1, b"key".to_vec(), &empty_state) + .unwrap(); + engine.write(&mut log_batch, false).unwrap(); + log_batch + .add_entries::(2, &[empty_entry.clone()]) + .unwrap(); + log_batch + .put_message(2, b"key".to_vec(), &empty_state) + .unwrap(); + engine.write(&mut log_batch, true).unwrap(); + + let engine = engine.reopen(); + assert_eq!( + engine.get_entry::(0, 0).unwrap().unwrap(), + empty_entry + ); + assert_eq!( + engine.get_entry::(2, 0).unwrap().unwrap(), + empty_entry + ); + assert_eq!( + engine + .get_message::(1, b"key") + .unwrap() + .unwrap(), + empty_state + ); + assert_eq!( + engine + .get_message::(2, b"key") + .unwrap() + .unwrap(), + empty_state + ); + } + + #[test] + fn test_empty_batch() { + let dir = tempfile::Builder::new() + .prefix("test_empty_batch") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + let data = vec![b'x'; 16]; + let cases = [[false, false], [false, true], [true, true]]; + for (i, writes) in cases.iter().enumerate() { + let rid = i as u64; + let mut batch = LogBatch::default(); + for &has_data in writes { + if has_data { + batch.put(rid, b"key".to_vec(), data.clone()).unwrap(); + } + engine.write(&mut batch, true).unwrap(); + assert!(batch.is_empty()); + } + } + } + + #[test] + fn test_dirty_recovery() { + let dir = tempfile::Builder::new() + .prefix("test_dirty_recovery") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + let data = vec![b'x'; 1024]; + + for rid in 1..21 { + engine.append(rid, 1, 21, Some(&data)); + } + + // Create an unrelated sub-directory. + std::fs::create_dir(dir.path().join(Path::new("random_dir"))).unwrap(); + // Create an unrelated file. 
+ let _f = std::fs::File::create(dir.path().join(Path::new("random_file"))).unwrap(); + + let engine = engine.reopen(); + for rid in 1..21 { + engine.scan_entries(rid, 1, 21, |_, _, d| { + assert_eq!(d, &data); + }); + } + } + + #[test] + fn test_large_rewrite_batch() { + let dir = tempfile::Builder::new() + .prefix("test_large_rewrite_batch") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + let engine = + RaftLogEngine::open_with_file_system(cfg, Arc::new(ObfuscatedFileSystem::default())) + .unwrap(); + let data = vec![b'x'; 2 * 1024 * 1024]; + + for rid in 1..=3 { + engine.append(rid, 1, 11, Some(&data)); + } + + let old_active_file = engine.file_span(LogQueue::Append).1; + engine.purge_manager.must_rewrite_append_queue(None, None); + assert_eq!(engine.file_span(LogQueue::Append).0, old_active_file + 1); + let old_active_file = engine.file_span(LogQueue::Rewrite).1; + engine.purge_manager.must_rewrite_rewrite_queue(); + assert!(engine.file_span(LogQueue::Rewrite).0 > old_active_file); + + for rid in engine.raft_groups() { + let mut total = 0; + engine + .scan_raw_messages(rid, None, None, false, |k, _| { + assert!(!crate::is_internal_key(k, None)); + total += 1; + true + }) + .unwrap(); + assert_eq!(total, 1); + } + assert_eq!(engine.raft_groups().len(), 3); + + let engine = engine.reopen(); + for rid in 1..=3 { + engine.scan_entries(rid, 1, 11, |_, _, d| { + assert_eq!(d, &data); + }); + } + } + + #[test] + fn test_combination_of_version_and_recycle() { + fn test_engine_ops(cfg_v1: &Config, cfg_v2: &Config) { + let rid = 1; + let data = vec![b'7'; 1024]; + { + // open engine with format_version - Version::V1 + let engine = RaftLogEngine::open(cfg_v1.clone()).unwrap(); + engine.append(rid, 0, 20, Some(&data)); + let append_first = engine.file_span(LogQueue::Append).0; + engine.compact_to(rid, 18); + engine.purge_expired_files().unwrap(); + assert!(engine.file_span(LogQueue::Append).0 > append_first); + assert_eq!(engine.first_index(rid).unwrap(), 18); + assert_eq!(engine.last_index(rid).unwrap(), 19); + } + { + // open engine with format_version - Version::V2 + let engine = RaftLogEngine::open(cfg_v2.clone()).unwrap(); + assert_eq!(engine.first_index(rid).unwrap(), 18); + assert_eq!(engine.last_index(rid).unwrap(), 19); + engine.append(rid, 20, 40, Some(&data)); + let append_first = engine.file_span(LogQueue::Append).0; + engine.compact_to(rid, 38); + engine.purge_expired_files().unwrap(); + assert!(engine.file_span(LogQueue::Append).0 > append_first); + assert_eq!(engine.first_index(rid).unwrap(), 38); + assert_eq!(engine.last_index(rid).unwrap(), 39); + } + { + // reopen engine with format_version - Version::V1 + let engine = RaftLogEngine::open(cfg_v1.clone()).unwrap(); + assert_eq!(engine.first_index(rid).unwrap(), 38); + assert_eq!(engine.last_index(rid).unwrap(), 39); + } + } + // test engine on mutable versions + { + let dir = tempfile::Builder::new() + .prefix("test_mutable_format_version") + .tempdir() + .unwrap(); + // config with v1 + let cfg_v1 = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1), + format_version: Version::V1, + enable_log_recycle: false, + ..Default::default() + }; + // config with v2 + let cfg_v2 = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1), + format_version: Version::V2, + 
enable_log_recycle: false, + ..Default::default() + }; + test_engine_ops(&cfg_v1, &cfg_v2); + } + // test engine when enable_log_recycle == true + { + let dir = tempfile::Builder::new() + .prefix("test_enable_log_recycle") + .tempdir() + .unwrap(); + // config with v1 + let cfg_v1 = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1), + format_version: Version::V1, + enable_log_recycle: false, + ..Default::default() + }; + // config with v2 + let cfg_v2 = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1), + format_version: Version::V2, + enable_log_recycle: true, + prefill_for_recycle: true, + ..Default::default() + }; + test_engine_ops(&cfg_v1, &cfg_v2); + } + } + + /// Test cases related to tools /// + + #[test] + fn test_dump_file_or_directory() { + let dir = tempfile::Builder::new() + .prefix("test_dump_file_or_directory") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 1024]; + let fs = Arc::new(ObfuscatedFileSystem::default()); + + let mut batches = vec![vec![LogBatch::default()]]; + let mut batch = LogBatch::default(); + batch + .add_entries::(7, &generate_entries(1, 11, Some(&entry_data))) + .unwrap(); + batch.add_command(7, Command::Clean); + batch.put(7, b"key".to_vec(), b"value".to_vec()).unwrap(); + batch.delete(7, b"key2".to_vec()); + batches.push(vec![batch.clone()]); + let mut batch2 = LogBatch::default(); + batch2.put(8, b"key3".to_vec(), b"value".to_vec()).unwrap(); + batch2 + .add_entries::(8, &generate_entries(5, 15, Some(&entry_data))) + .unwrap(); + batches.push(vec![batch, batch2]); + + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + + let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); + for bs in batches.iter_mut() { + for batch in bs.iter_mut() { + engine.write(batch, false).unwrap(); + } + + engine.sync().unwrap(); + } + + drop(engine); + // dump dir with raft groups. 8 element in raft groups 7 and 2 elements in raft + // groups 8 + let dump_it = Engine::dump_with_file_system(dir.path(), fs.clone()).unwrap(); + let total = dump_it + .inspect(|i| { + i.as_ref().unwrap(); + }) + .count(); + assert!(total == 10); + + // dump file + let file_id = FileId { + queue: LogQueue::Rewrite, + seq: 1, + }; + let dump_it = Engine::dump_with_file_system( + file_id.build_file_path(dir.path()).as_path(), + fs.clone(), + ) + .unwrap(); + let total = dump_it + .inspect(|i| { + i.as_ref().unwrap(); + }) + .count(); + assert!(0 == total); + + // dump dir that does not exists + assert!(Engine::dump_with_file_system(Path::new("/not_exists_dir"), fs.clone()).is_err()); + + // dump file that does not exists + let mut not_exists_file = PathBuf::from(dir.as_ref()); + not_exists_file.push("not_exists_file"); + assert!(Engine::dump_with_file_system(not_exists_file.as_path(), fs).is_err()); + } + + #[cfg(feature = "scripting")] + #[test] + fn test_repair_default() { + let dir = tempfile::Builder::new() + .prefix("test_repair_default") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 128]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), // Create lots of files. 
+ ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + + let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + for rid in 1..=50 { + engine.append(rid, 1, 6, Some(&entry_data)); + } + for rid in 25..=50 { + engine.append(rid, 6, 11, Some(&entry_data)); + } + drop(engine); + + let script1 = "".to_owned(); + RaftLogEngine::unsafe_repair_with_file_system( + dir.path(), + None, // queue + script1, + fs.clone(), + ) + .unwrap(); + let script2 = " + fn filter_append(id, first, count, rewrite_count, queue, ifirst, ilast) { + 0 + } + fn filter_compact(id, first, count, rewrite_count, queue, compact_to) { + 0 + } + fn filter_clean(id, first, count, rewrite_count, queue) { + 0 + } + " + .to_owned(); + RaftLogEngine::unsafe_repair_with_file_system( + dir.path(), + None, // queue + script2, + fs.clone(), + ) + .unwrap(); + + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + for rid in 1..25 { + engine.scan_entries(rid, 1, 6, |_, _, d| { + assert_eq!(d, &entry_data); + }); + } + for rid in 25..=50 { + engine.scan_entries(rid, 1, 11, |_, _, d| { + assert_eq!(d, &entry_data); + }); + } + } + + #[cfg(feature = "scripting")] + #[test] + fn test_repair_discard_entries() { + let dir = tempfile::Builder::new() + .prefix("test_repair_discard") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 128]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), // Create lots of files. + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + + let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + for rid in 1..=50 { + engine.append(rid, 1, 6, Some(&entry_data)); + } + for rid in 25..=50 { + engine.append(rid, 6, 11, Some(&entry_data)); + } + drop(engine); + + let incoming_emptied = [1, 25]; + let existing_emptied = [2, 26]; + let script = " + fn filter_append(id, first, count, rewrite_count, queue, ifirst, ilast) { + if id == 1 { + return 1; + } else if id == 2 { + return 2; + } else if id == 25 { + return 1; + } else if id == 26 { + return 2; + } + 0 // default + } + " + .to_owned(); + RaftLogEngine::unsafe_repair_with_file_system( + dir.path(), + None, // queue + script, + fs.clone(), + ) + .unwrap(); + + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + for rid in 1..25 { + if existing_emptied.contains(&rid) || incoming_emptied.contains(&rid) { + continue; + } + engine.scan_entries(rid, 1, 6, |_, _, d| { + assert_eq!(d, &entry_data); + }); + } + for rid in 25..=50 { + if existing_emptied.contains(&rid) || incoming_emptied.contains(&rid) { + continue; + } + engine.scan_entries(rid, 1, 11, |_, _, d| { + assert_eq!(d, &entry_data); + }); + } + for rid in existing_emptied { + let first_index = if rid < 25 { 1 } else { 6 }; + let last_index = if rid < 25 { 5 } else { 10 }; + engine.scan_entries(rid, first_index, last_index + 1, |_, _, d| { + assert_eq!(d, &entry_data); + }); + } + for rid in incoming_emptied { + let last_index = if rid < 25 { 5 } else { 10 }; + assert_eq!(engine.first_index(rid), None); + assert_eq!(engine.last_index(rid), None); + assert_eq!(engine.decode_last_index(rid), Some(last_index)); + } + } + + #[test] + fn test_tail_corruption() { + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 16]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + // One big file. 
+ target_file_size: ReadableSize::gb(10), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + + let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + for rid in 1..=50 { + engine.append(rid, 1, 6, Some(&entry_data)); + } + for rid in 25..=50 { + engine.append(rid, 6, 11, Some(&entry_data)); + } + let (_, last_file_seq) = engine.file_span(LogQueue::Append); + drop(engine); + + let last_file = FileId { + queue: LogQueue::Append, + seq: last_file_seq, + }; + let f = OpenOptions::new() + .write(true) + .open(last_file.build_file_path(dir.path())) + .unwrap(); + + // Corrupt a log batch. + f.set_len(f.metadata().unwrap().len() - 1).unwrap(); + RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + + // Corrupt the file header. + f.set_len(1).unwrap(); + RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + } + + #[test] + fn test_reopen_with_wrong_file_system() { + let dir = tempfile::Builder::new() + .prefix("test_reopen_with_wrong_file_system") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 128]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + + let engine = RaftLogEngine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + for rid in 1..=10 { + engine.append(rid, 1, 11, Some(&entry_data)); + } + drop(engine); + + assert!(RaftLogEngine::open(cfg.clone()).is_err()); + + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + for rid in 1..10 { + engine.scan_entries(rid, 1, 11, |_, _, d| { + assert_eq!(d, &entry_data); + }); + } + } + + #[cfg(feature = "nightly")] + #[bench] + fn bench_engine_fetch_entries(b: &mut test::Bencher) { + use rand::{thread_rng, Rng}; + + let dir = tempfile::Builder::new() + .prefix("bench_engine_fetch_entries") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 1024]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = RaftLogEngine::open(cfg).unwrap(); + for i in 0..10 { + for rid in 1..=100 { + engine.append(rid, 1 + i * 10, 1 + i * 10 + 10, Some(&entry_data)); + } + } + let mut vec: Vec = Vec::new(); + b.iter(move || { + let region_id = thread_rng().gen_range(1..=100); + engine + .fetch_entries_to::(region_id, 1, 101, None, &mut vec) + .unwrap(); + vec.clear(); + }); + } + + #[test] + fn test_engine_is_empty() { + let dir = tempfile::Builder::new() + .prefix("test_engine_is_empty") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 128]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let rid = 1; + + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + assert!(engine.is_empty()); + engine.append(rid, 1, 11, Some(&entry_data)); + assert!(!engine.is_empty()); + + let mut log_batch = LogBatch::default(); + log_batch.add_command(rid, Command::Compact { index: 11 }); + log_batch.delete(rid, b"last_index".to_vec()); + engine.write(&mut log_batch, true).unwrap(); + assert!(!engine.is_empty()); + + engine.clean(rid); + assert!(engine.is_empty()); + } + + pub struct DeleteMonitoredFileSystem { + inner: ObfuscatedFileSystem, + append_metadata: Mutex>, + reserved_metadata: Mutex>, + } + + impl DeleteMonitoredFileSystem { + fn new() -> Self { + Self { + inner: ObfuscatedFileSystem::default(), + append_metadata: 
Mutex::new(BTreeSet::new()), + reserved_metadata: Mutex::new(BTreeSet::new()), + } + } + + fn update_metadata(&self, path: &Path, delete: bool) -> bool { + let path = path.file_name().unwrap().to_str().unwrap(); + let parse_append = FileId::parse_file_name(path); + let parse_reserved = parse_reserved_file_name(path); + match (parse_append, parse_reserved) { + (Some(id), None) if id.queue == LogQueue::Append => { + if delete { + self.append_metadata.lock().unwrap().remove(&id.seq) + } else { + self.append_metadata.lock().unwrap().insert(id.seq) + } + } + (None, Some(seq)) => { + if delete { + self.reserved_metadata.lock().unwrap().remove(&seq) + } else { + self.reserved_metadata.lock().unwrap().insert(seq) + } + } + _ => false, + } + } + } + + impl FileSystem for DeleteMonitoredFileSystem { + type Handle = ::Handle; + type Reader = ::Reader; + type Writer = ::Writer; + + fn create>(&self, path: P) -> std::io::Result { + let handle = self.inner.create(&path)?; + self.update_metadata(path.as_ref(), false); + Ok(handle) + } + + fn open>(&self, path: P, perm: Permission) -> std::io::Result { + let handle = self.inner.open(&path, perm)?; + self.update_metadata(path.as_ref(), false); + Ok(handle) + } + + fn delete>(&self, path: P) -> std::io::Result<()> { + self.inner.delete(&path)?; + self.update_metadata(path.as_ref(), true); + Ok(()) + } + + fn rename>(&self, src_path: P, dst_path: P) -> std::io::Result<()> { + self.inner.rename(src_path.as_ref(), dst_path.as_ref())?; + self.update_metadata(src_path.as_ref(), true); + self.update_metadata(dst_path.as_ref(), false); + Ok(()) + } + + fn reuse>(&self, src_path: P, dst_path: P) -> std::io::Result<()> { + self.inner.reuse(src_path.as_ref(), dst_path.as_ref())?; + self.update_metadata(src_path.as_ref(), true); + self.update_metadata(dst_path.as_ref(), false); + Ok(()) + } + + fn delete_metadata>(&self, path: P) -> std::io::Result<()> { + self.inner.delete_metadata(&path)?; + self.update_metadata(path.as_ref(), true); + Ok(()) + } + + fn exists_metadata>(&self, path: P) -> bool { + if self.inner.exists_metadata(&path) { + return true; + } + let path = path.as_ref().file_name().unwrap().to_str().unwrap(); + let parse_append = FileId::parse_file_name(path); + let parse_reserved = parse_reserved_file_name(path); + match (parse_append, parse_reserved) { + (Some(id), None) if id.queue == LogQueue::Append => { + self.append_metadata.lock().unwrap().contains(&id.seq) + } + (None, Some(seq)) => self.reserved_metadata.lock().unwrap().contains(&seq), + _ => false, + } + } + + fn new_reader(&self, h: Arc) -> std::io::Result { + self.inner.new_reader(h) + } + + fn new_writer(&self, h: Arc) -> std::io::Result { + self.inner.new_writer(h) + } + } + + #[test] + fn test_managed_file_deletion() { + let dir = tempfile::Builder::new() + .prefix("test_managed_file_deletion") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 128]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1), + enable_log_recycle: false, + ..Default::default() + }; + let fs = Arc::new(DeleteMonitoredFileSystem::new()); + let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); + for rid in 1..=10 { + engine.append(rid, 1, 11, Some(&entry_data)); + } + for rid in 1..=5 { + engine.clean(rid); + } + let (start, _) = engine.file_span(LogQueue::Append); + engine.purge_expired_files().unwrap(); + // some active files have been deleted. 
+ assert!(start < engine.file_span(LogQueue::Append).0); + // corresponding physical files have been deleted too. + assert_eq!(engine.file_count(None), fs.inner.file_count()); + let start = engine.file_span(LogQueue::Append).0; + // metadata have been deleted. + assert_eq!( + fs.append_metadata.lock().unwrap().iter().next().unwrap(), + &start + ); + + let engine = engine.reopen(); + assert_eq!(engine.file_count(None), fs.inner.file_count()); + let (start, _) = engine.file_span(LogQueue::Append); + assert_eq!( + fs.append_metadata.lock().unwrap().iter().next().unwrap(), + &start + ); + + // Simulate stale metadata. + for i in start / 2..start { + fs.append_metadata.lock().unwrap().insert(i); + } + let engine = engine.reopen(); + let (start, _) = engine.file_span(LogQueue::Append); + assert_eq!( + fs.append_metadata.lock().unwrap().iter().next().unwrap(), + &start + ); + } + + #[test] + fn test_managed_file_reuse() { + let dir = tempfile::Builder::new() + .prefix("test_managed_file_reuse") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 16]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(50), + format_version: Version::V2, + enable_log_recycle: true, + prefill_for_recycle: true, + ..Default::default() + }; + let recycle_capacity = cfg.recycle_capacity() as u64; + let fs = Arc::new(DeleteMonitoredFileSystem::new()); + let engine = RaftLogEngine::open_with_file_system(cfg, fs.clone()).unwrap(); + + let reserved_start = *fs.reserved_metadata.lock().unwrap().first().unwrap(); + for rid in 1..=10 { + engine.append(rid, 1, 11, Some(&entry_data)); + } + for rid in 1..=10 { + engine.clean(rid); + } + // Purge all files. + engine.purge_manager.must_rewrite_append_queue(None, None); + assert_eq!(engine.file_count(Some(LogQueue::Append)), 1); + // Reserved files have been reused. + let reserved_start_1 = *fs.reserved_metadata.lock().unwrap().first().unwrap(); + assert!(reserved_start < reserved_start_1); + // Reuse more. + for rid in 1..=5 { + engine.append(rid, 1, 11, Some(&entry_data)); + } + let reserved_start_2 = *fs.reserved_metadata.lock().unwrap().first().unwrap(); + assert!(reserved_start_1 < reserved_start_2); + + let file_count = fs.inner.file_count(); + let start_1 = *fs.append_metadata.lock().unwrap().first().unwrap(); + let engine = engine.reopen(); + // Recycled files are reserved, but stale append files are renamed. The total + // count should stay unchanged. + assert_eq!(file_count, fs.inner.file_count()); + let start_2 = *fs.append_metadata.lock().unwrap().first().unwrap(); + assert!(start_1 < start_2); + let reserved_start_3 = *fs.reserved_metadata.lock().unwrap().first().unwrap(); + assert_eq!(reserved_start_2, reserved_start_3); + + // Reuse all of reserved files. + for rid in 1..=recycle_capacity { + engine.append(rid, 1, 11, Some(&entry_data)); + } + assert!(fs.reserved_metadata.lock().unwrap().is_empty()); + for rid in 1..=recycle_capacity { + engine.clean(rid); + } + engine.purge_manager.must_rewrite_append_queue(None, None); + // Then reuse a recycled append file. 
+ engine.append(1, 1, 11, Some(&entry_data)); + assert_eq!(engine.file_count(Some(LogQueue::Append)), 2); + let start_3 = *fs.append_metadata.lock().unwrap().first().unwrap(); + assert!(start_2 < start_3); + } + + #[test] + fn test_simple_write_perf_context() { + let dir = tempfile::Builder::new() + .prefix("test_simple_write_perf_context") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let rid = 1; + let entry_size = 5120; + let engine = RaftLogEngine::open(cfg).unwrap(); + let data = vec![b'x'; entry_size]; + let old_perf_context = get_perf_context(); + engine.append(rid, 1, 5, Some(&data)); + let new_perf_context = get_perf_context(); + assert_ne!( + old_perf_context.log_populating_duration, + new_perf_context.log_populating_duration + ); + assert_ne!( + old_perf_context.log_write_duration, + new_perf_context.log_write_duration + ); + assert_ne!( + old_perf_context.apply_duration, + new_perf_context.apply_duration + ); + } + + #[test] + fn test_recycle_no_signing_files() { + let dir = tempfile::Builder::new() + .prefix("test_recycle_no_signing_files") + .tempdir() + .unwrap(); + let entry_data = vec![b'x'; 128]; + let fs = Arc::new(DeleteMonitoredFileSystem::new()); + let cfg_v1 = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1024), + format_version: Version::V1, + enable_log_recycle: false, + ..Default::default() + }; + let cfg_v2 = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(15), + format_version: Version::V2, + enable_log_recycle: true, + prefill_for_recycle: false, + ..Default::default() + }; + assert!(cfg_v2.recycle_capacity() > 0); + // Prepare files with format_version V1 + { + let engine = RaftLogEngine::open_with_file_system(cfg_v1.clone(), fs.clone()).unwrap(); + for rid in 1..=10 { + engine.append(rid, 1, 11, Some(&entry_data)); + } + } + // Reopen the Engine with V2 and purge + { + let engine = RaftLogEngine::open_with_file_system(cfg_v2.clone(), fs.clone()).unwrap(); + let (start, _) = engine.file_span(LogQueue::Append); + for rid in 6..=10 { + engine.append(rid, 11, 20, Some(&entry_data)); + } + // Mark region_id -> 6 obsolete. + engine.clean(6); + // the [1, 12] files are recycled + engine.purge_expired_files().unwrap(); + assert_eq!(engine.file_count(Some(LogQueue::Append)), 5); + assert!(start < engine.file_span(LogQueue::Append).0); + } + // Reopen the Engine with V1 -> V2 and purge + { + let engine = RaftLogEngine::open_with_file_system(cfg_v1, fs.clone()).unwrap(); + let (start, _) = engine.file_span(LogQueue::Append); + for rid in 6..=10 { + engine.append(rid, 20, 30, Some(&entry_data)); + } + for rid in 6..=10 { + engine.append(rid, 30, 40, Some(&entry_data)); + } + for rid in 1..=5 { + engine.append(rid, 11, 20, Some(&entry_data)); + } + assert_eq!(engine.file_span(LogQueue::Append).0, start); + let file_count = engine.file_count(Some(LogQueue::Append)); + drop(engine); + let engine = RaftLogEngine::open_with_file_system(cfg_v2, fs).unwrap(); + assert_eq!(engine.file_span(LogQueue::Append).0, start); + assert_eq!(engine.file_count(Some(LogQueue::Append)), file_count); + // Mark all regions obsolete. 
+ for rid in 1..=10 { + engine.clean(rid); + } + let (start, _) = engine.file_span(LogQueue::Append); + // the [13, 32] files are purged + engine.purge_expired_files().unwrap(); + assert_eq!(engine.file_count(Some(LogQueue::Append)), 1); + assert!(engine.file_span(LogQueue::Append).0 > start); + } + } + + #[test] + fn test_start_engine_with_resize_recycle_capacity() { + let dir = tempfile::Builder::new() + .prefix("test_start_engine_with_resize_recycle_capacity") + .tempdir() + .unwrap(); + let path = dir.path().to_str().unwrap(); + let file_system = Arc::new(DeleteMonitoredFileSystem::new()); + let entry_data = vec![b'x'; 512]; + + // Case 1: start an engine with no-recycle. + let cfg = Config { + dir: path.to_owned(), + enable_log_recycle: false, + ..Default::default() + }; + let engine = RaftLogEngine::open_with_file_system(cfg, file_system.clone()).unwrap(); + let (start, _) = engine.file_span(LogQueue::Append); + // Only one valid file left, the last one => active_file. + assert_eq!(engine.file_count(Some(LogQueue::Append)), 1); + assert_eq!(file_system.inner.file_count(), engine.file_count(None)); + // Append data. + for rid in 1..=5 { + engine.append(rid, 1, 10, Some(&entry_data)); + } + assert_eq!(engine.file_span(LogQueue::Append).0, start); + assert_eq!(file_system.inner.file_count(), engine.file_count(None)); + drop(engine); + + // Case 2: restart the engine with a common size of recycling capacity. + let cfg = Config { + dir: path.to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(80), // common size of capacity + enable_log_recycle: true, + prefill_for_recycle: true, + ..Default::default() + }; + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + let (start, end) = engine.file_span(LogQueue::Append); + // Only one valid file left, the last one => active_file. + assert_eq!(start, end); + let recycled_count = file_system.inner.file_count() - engine.file_count(None); + assert!(recycled_count > 0); + // Append data. Several recycled files have been reused. + for rid in 1..=5 { + engine.append(rid, 10, 20, Some(&entry_data)); + } + assert_eq!(engine.file_span(LogQueue::Append).0, start); + assert!(recycled_count > file_system.inner.file_count() - engine.file_count(None)); + let (start, end) = engine.file_span(LogQueue::Append); + let recycled_count = file_system.inner.file_count() - engine.file_count(None); + drop(engine); + + // Case 3: restart the engine with a smaller capacity. Redundant recycled files + // will be cleared. + let cfg_v2 = Config { + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(50), + ..cfg + }; + let engine = + RaftLogEngine::open_with_file_system(cfg_v2.clone(), file_system.clone()).unwrap(); + assert_eq!(engine.file_span(LogQueue::Append), (start, end)); + assert!(recycled_count > file_system.inner.file_count() - engine.file_count(None)); + // Recycled files have filled the LogQueue::Append, purge_expired_files won't + // truely remove files from it. + engine.purge_expired_files().unwrap(); + assert_eq!(engine.file_span(LogQueue::Append), (start, end)); + for rid in 1..=10 { + engine.append(rid, 20, 31, Some(&entry_data)); + } + assert!(engine.file_span(LogQueue::Append).1 > end); + let engine = engine.reopen(); + assert!(recycled_count > file_system.inner.file_count() - engine.file_count(None)); + drop(engine); + + // Case 4: restart the engine without log recycling. Recycled logs should be + // cleared. 
+ let cfg_v3 = Config { + target_file_size: ReadableSize::kb(2), + purge_threshold: ReadableSize::kb(100), + enable_log_recycle: false, + prefill_for_recycle: false, + ..cfg_v2 + }; + let engine = RaftLogEngine::open_with_file_system(cfg_v3, file_system.clone()).unwrap(); + assert_eq!(file_system.inner.file_count(), engine.file_count(None)); + } + + #[test] + fn test_rewrite_atomic_group() { + let dir = tempfile::Builder::new() + .prefix("test_rewrite_atomic_group") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + // Make sure each file gets replayed individually. + recovery_threads: 100, + target_file_size: ReadableSize(1), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let key = vec![b'x'; 2]; + let value = vec![b'y'; 8]; + + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + let mut data = HashSet::new(); + let mut rid = 1; + // Directly write to pipe log. + let mut log_batch = LogBatch::default(); + let flush = |lb: &mut LogBatch| { + lb.finish_populate(0, None).unwrap(); + engine.pipe_log.append(LogQueue::Rewrite, lb).unwrap(); + lb.drain(); + }; + { + // begin. + let mut builder = AtomicGroupBuilder::with_id(3); + builder.begin(&mut log_batch); + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + flush(&mut log_batch); + engine.pipe_log.rotate(LogQueue::Rewrite).unwrap(); + } + { + // begin - unrelated - end. + let mut builder = AtomicGroupBuilder::with_id(3); + builder.begin(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + // plug a unrelated write. + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + builder.end(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + engine.pipe_log.rotate(LogQueue::Rewrite).unwrap(); + } + { + // begin - middle - middle - end. + let mut builder = AtomicGroupBuilder::with_id(3); + builder.begin(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + builder.add(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + builder.add(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + builder.end(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + engine.pipe_log.rotate(LogQueue::Rewrite).unwrap(); + } + { + // begin - begin - end. + let mut builder = AtomicGroupBuilder::with_id(3); + builder.begin(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + flush(&mut log_batch); + let mut builder = AtomicGroupBuilder::with_id(3); + builder.begin(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + builder.end(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + engine.pipe_log.rotate(LogQueue::Rewrite).unwrap(); + } + { + // end - middle - end. + // We must change id to avoid getting merged with last group. + // It is actually not possible in real life to only have "begin" missing. 
+ let mut builder = AtomicGroupBuilder::with_id(4); + builder.begin(&mut LogBatch::default()); + builder.end(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + flush(&mut log_batch); + let mut builder = AtomicGroupBuilder::with_id(4); + builder.begin(&mut LogBatch::default()); + builder.add(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + flush(&mut log_batch); + builder.end(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + flush(&mut log_batch); + engine.pipe_log.rotate(LogQueue::Rewrite).unwrap(); + } + { + // end - begin - end + let mut builder = AtomicGroupBuilder::with_id(5); + builder.begin(&mut LogBatch::default()); + builder.end(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + flush(&mut log_batch); + let mut builder = AtomicGroupBuilder::with_id(5); + builder.begin(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + builder.end(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + engine.pipe_log.rotate(LogQueue::Rewrite).unwrap(); + } + { + // begin - end - begin - end. + let mut builder = AtomicGroupBuilder::with_id(6); + builder.begin(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + builder.end(&mut log_batch); + flush(&mut log_batch); + let mut builder = AtomicGroupBuilder::with_id(7); + builder.begin(&mut log_batch); + flush(&mut log_batch); + builder.end(&mut log_batch); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + data.insert(rid); + flush(&mut log_batch); + engine.pipe_log.rotate(LogQueue::Rewrite).unwrap(); + } + engine.pipe_log.sync(LogQueue::Rewrite).unwrap(); + + let engine = engine.reopen(); + for rid in engine.raft_groups() { + assert!(data.remove(&rid), "{}", rid); + assert_eq!(engine.get(rid, &key).unwrap(), value); + } + assert!(data.is_empty(), "data loss {:?}", data); + } + + #[test] + fn test_internal_key_filter() { + let dir = tempfile::Builder::new() + .prefix("test_internal_key_filter") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let engine = RaftLogEngine::open_with_file_system(cfg, fs).unwrap(); + let value = vec![b'y'; 8]; + let mut log_batch = LogBatch::default(); + log_batch.put_unchecked(1, crate::make_internal_key(&[1]), value.clone()); + log_batch.put_unchecked(2, crate::make_internal_key(&[1]), value.clone()); + engine.write(&mut log_batch, false).unwrap(); + // Apply of append filtered. + assert!(engine.raft_groups().is_empty()); + + let engine = engine.reopen(); + // Replay of append filtered. + assert!(engine.raft_groups().is_empty()); + + log_batch.put_unchecked(3, crate::make_internal_key(&[1]), value.clone()); + log_batch.put_unchecked(4, crate::make_internal_key(&[1]), value); + log_batch.finish_populate(0, None).unwrap(); + let block_handle = engine + .pipe_log + .append(LogQueue::Rewrite, &mut log_batch) + .unwrap(); + log_batch.finish_write(block_handle); + engine + .memtables + .apply_rewrite_writes(log_batch.drain(), None, 0); + // Apply of rewrite filtered. + assert!(engine.raft_groups().is_empty()); + + let engine = engine.reopen(); + // Replay of rewrite filtered. 
+ assert!(engine.raft_groups().is_empty()); + } + + #[test] + fn test_start_engine_with_multi_dirs() { + let dir = tempfile::Builder::new() + .prefix("test_start_engine_with_multi_dirs_default") + .tempdir() + .unwrap(); + let spill_dir = tempfile::Builder::new() + .prefix("test_start_engine_with_multi_dirs_spill") + .tempdir() + .unwrap(); + fn number_of_files(p: &Path) -> usize { + let mut r = 0; + std::fs::read_dir(p).unwrap().for_each(|e| { + if e.unwrap() + .path() + .file_name() + .unwrap() + .to_str() + .unwrap() + .starts_with("000") + { + r += 1; + } + }); + r + } + let file_system = Arc::new(DeleteMonitoredFileSystem::new()); + let entry_data = vec![b'x'; 512]; + + // Preparations for multi-dirs. + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + spill_dir: Some(spill_dir.path().to_str().unwrap().to_owned()), + enable_log_recycle: false, + target_file_size: ReadableSize(1), + ..Default::default() + }; + { + // Step 1: write data into the main directory. + let engine = + RaftLogEngine::open_with_file_system(cfg.clone(), file_system.clone()).unwrap(); + for rid in 1..=10 { + engine.append(rid, 1, 10, Some(&entry_data)); + } + drop(engine); + + // Step 2: select several log files and move them into the `spill_dir` + // directory. + let mut moved = 0; + for e in std::fs::read_dir(dir.path()).unwrap() { + let p = e.unwrap().path(); + let file_name = p.file_name().unwrap().to_str().unwrap(); + if let Some(FileId { + queue: LogQueue::Append, + seq: _, + }) = FileId::parse_file_name(file_name) + { + file_system + .rename(&p, &spill_dir.path().join(file_name)) + .unwrap(); + moved += 1; + if moved == 4 { + break; + } + } + } + } + + // Restart the engine with recycle and prefill. Test reusing files from both + // dirs. + let cfg_2 = Config { + enable_log_recycle: true, + prefill_for_recycle: true, + purge_threshold: ReadableSize(40), + ..cfg.clone() + }; + let recycle_capacity = cfg_2.recycle_capacity() as u64; + let engine = RaftLogEngine::open_with_file_system(cfg_2, file_system.clone()).unwrap(); + assert!(number_of_files(spill_dir.path()) > 0); + for rid in 1..=10 { + assert_eq!(engine.first_index(rid).unwrap(), 1); + engine.clean(rid); + } + engine.purge_manager.must_rewrite_append_queue(None, None); + let file_count = file_system.inner.file_count(); + assert_eq!( + number_of_files(spill_dir.path()) + number_of_files(dir.path()), + file_count + ); + assert!(file_count > engine.file_count(None)); + // Append data, recycled files are reused. + for rid in 1..=recycle_capacity - 10 { + engine.append(rid, 20, 30, Some(&entry_data)); + } + // No new file is created. + assert_eq!(file_count, file_system.inner.file_count()); + assert!(number_of_files(spill_dir.path()) > 0); + + let cfg_3 = Config { + enable_log_recycle: false, + purge_threshold: ReadableSize(40), + ..cfg + }; + drop(engine); + let engine = RaftLogEngine::open_with_file_system(cfg_3, file_system).unwrap(); + assert!(number_of_files(spill_dir.path()) > 0); + for rid in 1..=10 { + assert_eq!(engine.first_index(rid).unwrap(), 20); + } + + // abnormal case - duplicate FileSeq among different dirs. + { + // Prerequisite: choose several files and duplicate them to main dir. 
+ let mut file_count = 0; + for e in std::fs::read_dir(spill_dir.path()).unwrap() { + let p = e.unwrap().path(); + let file_name = p.file_name().unwrap().to_str().unwrap(); + if let Some(FileId { + queue: LogQueue::Append, + seq: _, + }) = FileId::parse_file_name(file_name) + { + if file_count % 2 == 0 { + std::fs::copy(&p, dir.path().join(file_name)).unwrap(); + } + file_count += 1; + } + } + } + let start = engine.file_span(LogQueue::Append).0; + let engine = engine.reopen(); + // Duplicate log files will be skipped and cleared. + assert!(engine.file_span(LogQueue::Append).0 > start); + } +} diff --git a/third/raft-engine/src/env/default.rs b/third/raft-engine/src/env/default.rs new file mode 100644 index 00000000..44f4fa18 --- /dev/null +++ b/third/raft-engine/src/env/default.rs @@ -0,0 +1,135 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +#[cfg(feature = "failpoints")] +use std::io::{Error, ErrorKind}; +use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; +use std::path::Path; +use std::sync::Arc; + +use fail::fail_point; + +use crate::env::log_fd::LogFd; +use crate::env::{FileSystem, Handle, Permission, WriteExt}; + +/// A low-level file adapted for standard interfaces including [`Seek`], +/// [`Write`] and [`Read`]. +pub struct LogFile { + inner: Arc, + offset: usize, +} + +impl LogFile { + /// Creates a new [`LogFile`] from a shared [`LogFd`]. + pub fn new(fd: Arc) -> Self { + Self { + inner: fd, + offset: 0, + } + } +} + +impl Write for LogFile { + fn write(&mut self, buf: &[u8]) -> IoResult { + fail_point!("log_file::write::zero", |_| { Ok(0) }); + + let len = self.inner.write(self.offset, buf)?; + + fail_point!("log_file::write::err", |_| { + Err(Error::new(ErrorKind::InvalidInput, "fp")) + }); + + self.offset += len; + Ok(len) + } + + fn flush(&mut self) -> IoResult<()> { + Ok(()) + } +} + +impl Read for LogFile { + fn read(&mut self, buf: &mut [u8]) -> IoResult { + fail_point!("log_file::read::err", |_| { + Err(Error::new(ErrorKind::InvalidInput, "fp")) + }); + + let len = self.inner.read(self.offset, buf)?; + self.offset += len; + Ok(len) + } +} + +impl Seek for LogFile { + fn seek(&mut self, pos: SeekFrom) -> IoResult { + fail_point!("log_file::seek::err", |_| { + Err(std::io::Error::new(std::io::ErrorKind::Other, "fp")) + }); + match pos { + SeekFrom::Start(offset) => self.offset = offset as usize, + SeekFrom::Current(i) => self.offset = (self.offset as i64 + i) as usize, + SeekFrom::End(i) => self.offset = (self.inner.file_size()? 
as i64 + i) as usize, + } + Ok(self.offset as u64) + } +} + +impl WriteExt for LogFile { + fn truncate(&mut self, offset: usize) -> IoResult<()> { + fail_point!("log_file::truncate::err", |_| { + Err(Error::new(ErrorKind::InvalidInput, "fp")) + }); + + self.inner.truncate(offset)?; + self.offset = offset; + Ok(()) + } + + fn allocate(&mut self, offset: usize, size: usize) -> IoResult<()> { + fail_point!("log_file::allocate::err", |_| { + Err(Error::new(ErrorKind::InvalidInput, "fp")) + }); + + self.inner.allocate(offset, size) + } +} + +pub struct DefaultFileSystem; + +impl FileSystem for DefaultFileSystem { + type Handle = LogFd; + type Reader = LogFile; + type Writer = LogFile; + + fn create>(&self, path: P) -> IoResult { + fail_point!("default_fs::create::err", |_| { + Err(Error::new(ErrorKind::InvalidInput, "fp")) + }); + + LogFd::create(path.as_ref()) + } + + fn open>(&self, path: P, perm: Permission) -> IoResult { + fail_point!("default_fs::open::err", |_| { + Err(Error::new(ErrorKind::InvalidInput, "fp")) + }); + + LogFd::open(path.as_ref(), perm) + } + + fn delete>(&self, path: P) -> IoResult<()> { + fail_point!("default_fs::delete_skipped", |_| { Ok(()) }); + std::fs::remove_file(path) + } + + fn rename>(&self, src_path: P, dst_path: P) -> IoResult<()> { + std::fs::rename(src_path, dst_path) + } + + fn new_reader(&self, handle: Arc) -> IoResult { + Ok(LogFile::new(handle)) + } + + fn new_writer(&self, handle: Arc) -> IoResult { + Ok(LogFile::new(handle)) + } +} diff --git a/third/raft-engine/src/env/log_fd.rs b/third/raft-engine/src/env/log_fd.rs new file mode 100644 index 00000000..23cc2b3f --- /dev/null +++ b/third/raft-engine/src/env/log_fd.rs @@ -0,0 +1,11 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +#[cfg(not(any(windows, feature = "std_fs")))] +mod unix; +#[cfg(not(any(windows, feature = "std_fs")))] +pub use unix::LogFd; + +#[cfg(any(windows, feature = "std_fs"))] +mod plain; +#[cfg(any(windows, feature = "std_fs"))] +pub use plain::LogFd; diff --git a/third/raft-engine/src/env/log_fd/plain.rs b/third/raft-engine/src/env/log_fd/plain.rs new file mode 100644 index 00000000..03328e91 --- /dev/null +++ b/third/raft-engine/src/env/log_fd/plain.rs @@ -0,0 +1,84 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +//! A naive file handle implementation based on standard `File`. All I/O +//! operations need to synchronize under a `RwLock`. 
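
// Editor's note, not part of this patch: a minimal sketch of the locking idea the
// module doc above describes. Positional reads on a shared standard `File` must
// serialize `seek` + `read` behind a lock, because one handle owns a single cursor.
// The names below are illustrative only, and a plain `Mutex` stands in for the
// `parking_lot::RwLock` used by `plain.rs`.

use std::fs::File;
use std::io::{Read, Result, Seek, SeekFrom};
use std::sync::Mutex;

// Hypothetical wrapper type, for illustration only.
struct LockedFile(Mutex<File>);

impl LockedFile {
    // Emulates a positional read: seek and read while holding the lock.
    fn read_at(&self, offset: u64, buf: &mut [u8]) -> Result<usize> {
        let mut file = self.0.lock().unwrap();
        file.seek(SeekFrom::Start(offset))?;
        file.read(buf)
    }
}
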
+ +use crate::env::{Handle, Permission}; + +use fail::fail_point; +use parking_lot::RwLock; + +use std::fs::{File, OpenOptions}; +use std::io::{Error, ErrorKind, Read, Result, Seek, SeekFrom, Write}; +use std::path::Path; +use std::sync::Arc; + +pub struct LogFd(Arc>); + +impl LogFd { + pub fn open>(path: P, _: Permission) -> Result { + OpenOptions::new() + .read(true) + .write(true) + .open(path) + .map(|x| Self(Arc::new(RwLock::new(x)))) + } + + pub fn create>(path: P) -> Result { + OpenOptions::new() + .create(true) + .read(true) + .write(true) + .open(path) + .map(|x| Self(Arc::new(RwLock::new(x)))) + } + + pub fn read(&self, offset: usize, buf: &mut [u8]) -> Result { + let mut file = self.0.write(); + let _ = file.seek(SeekFrom::Start(offset as u64))?; + file.read(buf) + } + + pub fn write(&self, offset: usize, content: &[u8]) -> Result { + fail_point!("log_fd::write::no_space_err", |_| { + Err(Error::new(ErrorKind::Other, "nospace")) + }); + + let mut file = self.0.write(); + let _ = file.seek(SeekFrom::Start(offset as u64))?; + file.write(content) + } + + pub fn truncate(&self, offset: usize) -> Result<()> { + let file = self.0.write(); + file.set_len(offset as u64) + } + + pub fn allocate(&self, _offset: usize, _size: usize) -> Result<()> { + Ok(()) + } +} + +impl Handle for LogFd { + fn truncate(&self, offset: usize) -> Result<()> { + self.truncate(offset) + } + + fn file_size(&self) -> Result { + fail_point!("log_fd::file_size::err", |_| { + Err(Error::new(ErrorKind::InvalidInput, "fp")) + }); + + let file = self.0.read(); + file.metadata().map(|x| x.len() as usize) + } + + fn sync(&self) -> Result<()> { + fail_point!("log_fd::sync::err", |_| { + Err(Error::new(ErrorKind::InvalidInput, "fp")) + }); + + let file = self.0.write(); + file.sync_all() + } +} diff --git a/third/raft-engine/src/env/log_fd/unix.rs b/third/raft-engine/src/env/log_fd/unix.rs new file mode 100644 index 00000000..030c3fed --- /dev/null +++ b/third/raft-engine/src/env/log_fd/unix.rs @@ -0,0 +1,211 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use std::io::Result as IoResult; +use std::os::unix::io::RawFd; + +use fail::fail_point; +use log::error; +use nix::errno::Errno; +use nix::fcntl::{self, OFlag}; +use nix::sys::stat::Mode; +use nix::sys::uio::{pread, pwrite}; +use nix::unistd::{close, ftruncate, lseek, Whence}; +use nix::NixPath; + +use crate::env::{Handle, Permission}; + +fn from_nix_error(e: nix::Error, custom: &'static str) -> std::io::Error { + let kind = std::io::Error::from(e).kind(); + std::io::Error::new(kind, custom) +} + +impl From for OFlag { + fn from(value: Permission) -> OFlag { + match value { + Permission::ReadOnly => OFlag::O_RDONLY, + Permission::ReadWrite => OFlag::O_RDWR, + } + } +} + +/// A RAII-style low-level file. Errors occurred during automatic resource +/// release are logged and ignored. +/// +/// A [`LogFd`] is essentially a thin wrapper around [`RawFd`]. It's only +/// supported on *Unix*, and primarily optimized for *Linux*. +/// +/// All [`LogFd`] instances are opened with read and write permission. +pub struct LogFd(RawFd); + +impl LogFd { + /// Opens a file with the given `path`. 
+ pub fn open(path: &P, perm: Permission) -> IoResult { + // Permission 644 + let mode = Mode::S_IRUSR | Mode::S_IWUSR | Mode::S_IRGRP | Mode::S_IROTH; + fail_point!("log_fd::open::fadvise_dontneed", |_| { + let fd = + LogFd(fcntl::open(path, perm.into(), mode).map_err(|e| from_nix_error(e, "open"))?); + #[cfg(target_os = "linux")] + unsafe { + extern crate libc; + libc::posix_fadvise64(fd.0, 0, fd.file_size()? as i64, libc::POSIX_FADV_DONTNEED); + } + Ok(fd) + }); + Ok(LogFd( + fcntl::open(path, perm.into(), mode).map_err(|e| from_nix_error(e, "open"))?, + )) + } + + /// Opens a file with the given `path`. The specified file will be created + /// first if not exists. + pub fn create(path: &P) -> IoResult { + let flags = OFlag::O_RDWR | OFlag::O_CREAT; + // Permission 644 + let mode = Mode::S_IRUSR | Mode::S_IWUSR | Mode::S_IRGRP | Mode::S_IROTH; + let fd = fcntl::open(path, flags, mode).map_err(|e| from_nix_error(e, "open"))?; + Ok(LogFd(fd)) + } + + /// Closes the file. + pub fn close(&self) -> IoResult<()> { + fail_point!("log_fd::close::err", |_| { + Err(from_nix_error(nix::Error::EINVAL, "fp")) + }); + close(self.0).map_err(|e| from_nix_error(e, "close")) + } + + /// Reads some bytes starting at `offset` from this file into the specified + /// buffer. Returns how many bytes were read. + pub fn read(&self, mut offset: usize, buf: &mut [u8]) -> IoResult { + let mut readed = 0; + while readed < buf.len() { + let bytes = match pread(self.0, &mut buf[readed..], offset as i64) { + Ok(bytes) => bytes, + Err(Errno::EINTR) => continue, + Err(e) => return Err(from_nix_error(e, "pread")), + }; + // EOF + if bytes == 0 { + break; + } + readed += bytes; + offset += bytes; + } + Ok(readed) + } + + /// Writes some bytes to this file starting at `offset`. Returns how many + /// bytes were written. + pub fn write(&self, mut offset: usize, content: &[u8]) -> IoResult { + fail_point!("log_fd::write::no_space_err", |_| { + Err(from_nix_error(nix::Error::ENOSPC, "nospace")) + }); + let mut written = 0; + while written < content.len() { + let bytes = match pwrite(self.0, &content[written..], offset as i64) { + Ok(bytes) => bytes, + Err(Errno::EINTR) => continue, + Err(e) if e == Errno::ENOSPC => return Err(from_nix_error(e, "nospace")), + Err(e) => return Err(from_nix_error(e, "pwrite")), + }; + if bytes == 0 { + break; + } + written += bytes; + offset += bytes; + } + Ok(written) + } + + /// Truncates all data after `offset`. + pub fn truncate(&self, offset: usize) -> IoResult<()> { + ftruncate(self.0, offset as i64).map_err(|e| from_nix_error(e, "ftruncate")) + } + + /// Attempts to allocate space for `size` bytes starting at `offset`. 
+ #[allow(unused_variables)] + pub fn allocate(&self, offset: usize, size: usize) -> IoResult<()> { + #[cfg(target_os = "linux")] + { + if let Err(e) = fcntl::fallocate( + self.0, + fcntl::FallocateFlags::empty(), + offset as i64, + size as i64, + ) { + if e != nix::Error::EOPNOTSUPP { + return Err(from_nix_error(e, "fallocate")); + } + } + } + Ok(()) + } +} + +impl Handle for LogFd { + #[inline] + fn truncate(&self, offset: usize) -> IoResult<()> { + self.truncate(offset) + } + + #[inline] + fn file_size(&self) -> IoResult { + fail_point!("log_fd::file_size::err", |_| { + Err(from_nix_error(nix::Error::EINVAL, "fp")) + }); + lseek(self.0, 0, Whence::SeekEnd) + .map(|n| n as usize) + .map_err(|e| from_nix_error(e, "lseek")) + } + + #[inline] + fn sync(&self) -> IoResult<()> { + fail_point!("log_fd::sync::err", |_| { + Err(from_nix_error(nix::Error::EINVAL, "fp")) + }); + #[cfg(target_os = "linux")] + { + nix::unistd::fdatasync(self.0).map_err(|e| from_nix_error(e, "fdatasync")) + } + #[cfg(not(target_os = "linux"))] + { + nix::unistd::fsync(self.0).map_err(|e| from_nix_error(e, "fsync")) + } + } + + #[inline] + fn sync_range(&self, offset: usize, nbytes: usize) -> IoResult<()> { + sync_file_range(self.0, offset, nbytes, false) + } +} + +impl Drop for LogFd { + fn drop(&mut self) { + if let Err(e) = self.close() { + error!("error while closing file: {e}"); + } + } +} + +#[inline] +pub(crate) fn sync_file_range(fd: RawFd, offset: usize, nbytes: usize, wait: bool) -> IoResult<()> { + #[cfg(target_os = "linux")] + unsafe { + use libc::{sync_file_range, SYNC_FILE_RANGE_WAIT_AFTER, SYNC_FILE_RANGE_WRITE}; + + let flags = if wait { + SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER + } else { + SYNC_FILE_RANGE_WRITE + }; + if sync_file_range(fd, offset as i64, nbytes as i64, flags) != 0 { + return Err(std::io::Error::last_os_error()); + } + } + #[cfg(not(target_os = "linux"))] + { + let _ = (offset, nbytes); + } + Ok(()) +} diff --git a/third/raft-engine/src/env/mod.rs b/third/raft-engine/src/env/mod.rs new file mode 100644 index 00000000..1e3c699b --- /dev/null +++ b/third/raft-engine/src/env/mod.rs @@ -0,0 +1,96 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use std::io::{Read, Result, Seek, Write}; +use std::path::Path; +use std::sync::Arc; + +mod default; +mod log_fd; +mod obfuscated; + +pub use default::DefaultFileSystem; +pub use obfuscated::ObfuscatedFileSystem; + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum Permission { + ReadOnly, + ReadWrite, +} + +/// FileSystem +pub trait FileSystem: Send + Sync { + type Handle: Send + Sync + Handle; + type Reader: Seek + Read + Send; + type Writer: Seek + Write + Send + WriteExt; + + fn create>(&self, path: P) -> Result; + + fn open>(&self, path: P, perm: Permission) -> Result; + + fn delete>(&self, path: P) -> Result<()>; + + fn rename>(&self, src_path: P, dst_path: P) -> Result<()>; + + /// Reuses file at `src_path` as a new file at `dst_path`. The default + /// implementation simply renames the file. + fn reuse>(&self, src_path: P, dst_path: P) -> Result<()> { + self.rename(src_path, dst_path) + } + + #[inline] + fn reuse_and_open>(&self, src_path: P, dst_path: P) -> Result { + self.reuse(src_path.as_ref(), dst_path.as_ref())?; + self.open(dst_path, Permission::ReadWrite) + } + + /// Deletes user implemented metadata associated with `path`. Returns + /// `true` if any metadata is deleted. 
+    ///
+    /// In older versions of Raft Engine, physical files are deleted without
+    /// going through the user-implemented cleanup procedure. This method is
+    /// used to detect and clean up user metadata that is no longer mapped to
+    /// a physical file.
+    fn delete_metadata<P: AsRef<Path>>(&self, _path: P) -> Result<()> {
+        Ok(())
+    }
+
+    /// Returns whether there is any user metadata associated with the given
+    /// `path`.
+    fn exists_metadata<P: AsRef<Path>>(&self, _path: P) -> bool {
+        false
+    }
+
+    fn new_reader(&self, handle: Arc<Self::Handle>) -> Result<Self::Reader>;
+
+    fn new_writer(&self, handle: Arc<Self::Handle>) -> Result<Self::Writer>;
+}
+
+pub trait Handle {
+    fn truncate(&self, offset: usize) -> Result<()>;
+
+    /// Returns the current size of this file.
+    fn file_size(&self) -> Result<usize>;
+
+    fn sync(&self) -> Result<()>;
+
+    /// Syncs the specified range asynchronously.
+    fn sync_range(&self, offset: usize, nbytes: usize) -> Result<()>;
+}
+
+/// `WriteExt` is the writer extension API.
+pub trait WriteExt {
+    fn truncate(&mut self, offset: usize) -> Result<()>;
+    fn allocate(&mut self, offset: usize, size: usize) -> Result<()>;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::Permission;
+
+    #[test]
+    fn test_copy_permission() {
+        let perm = Permission::ReadWrite;
+        let perm1 = perm;
+        assert_eq!(perm, Permission::ReadWrite);
+        assert_eq!(perm1, Permission::ReadWrite);
+    }
+}
diff --git a/third/raft-engine/src/env/obfuscated.rs b/third/raft-engine/src/env/obfuscated.rs
new file mode 100644
index 00000000..6adaf277
--- /dev/null
+++ b/third/raft-engine/src/env/obfuscated.rs
@@ -0,0 +1,130 @@
+// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0.
+
+use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write};
+use std::path::Path;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+
+use crate::env::{DefaultFileSystem, FileSystem, Permission, WriteExt};
+
+pub struct ObfuscatedReader(<DefaultFileSystem as FileSystem>::Reader);
+
+impl Read for ObfuscatedReader {
+    fn read(&mut self, buf: &mut [u8]) -> IoResult<usize> {
+        if !buf.is_empty() {
+            let len = self.0.read(&mut buf[..1])?;
+            if len == 1 {
+                buf[0] = buf[0].wrapping_sub(1);
+            }
+            Ok(len)
+        } else {
+            Ok(0)
+        }
+    }
+}
+
+impl Seek for ObfuscatedReader {
+    fn seek(&mut self, pos: SeekFrom) -> IoResult<u64> {
+        self.0.seek(pos)
+    }
+}
+
+pub struct ObfuscatedWriter(<DefaultFileSystem as FileSystem>::Writer);
+
+impl Write for ObfuscatedWriter {
+    fn write(&mut self, buf: &[u8]) -> IoResult<usize> {
+        if !buf.is_empty() {
+            let tmp_vec = vec![buf[0].wrapping_add(1)];
+            self.0.write(&tmp_vec)
+        } else {
+            Ok(0)
+        }
+    }
+
+    fn flush(&mut self) -> IoResult<()> {
+        self.0.flush()
+    }
+}
+
+impl Seek for ObfuscatedWriter {
+    fn seek(&mut self, pos: SeekFrom) -> IoResult<u64> {
+        self.0.seek(pos)
+    }
+}
+
+impl WriteExt for ObfuscatedWriter {
+    fn truncate(&mut self, offset: usize) -> IoResult<()> {
+        self.0.truncate(offset)
+    }
+
+    fn allocate(&mut self, offset: usize, size: usize) -> IoResult<()> {
+        self.0.allocate(offset, size)
+    }
+}
+
+/// [`ObfuscatedFileSystem`] is a special implementation of [`FileSystem`],
+/// which is used for constructing and simulating an abnormal file system for
+/// [`Read`] and [`Write`].
+pub struct ObfuscatedFileSystem { + inner: DefaultFileSystem, + files: AtomicUsize, +} + +impl Default for ObfuscatedFileSystem { + fn default() -> Self { + ObfuscatedFileSystem { + inner: DefaultFileSystem, + files: AtomicUsize::new(0), + } + } +} + +impl ObfuscatedFileSystem { + pub fn file_count(&self) -> usize { + self.files.load(Ordering::Relaxed) + } +} + +impl FileSystem for ObfuscatedFileSystem { + type Handle = ::Handle; + type Reader = ObfuscatedReader; + type Writer = ObfuscatedWriter; + + fn create>(&self, path: P) -> IoResult { + let r = self.inner.create(path); + if r.is_ok() { + self.files.fetch_add(1, Ordering::Relaxed); + } + r + } + + fn open>(&self, path: P, perm: Permission) -> IoResult { + self.inner.open(path, perm) + } + + fn delete>(&self, path: P) -> IoResult<()> { + let r = self.inner.delete(path); + if r.is_ok() { + self.files.fetch_sub(1, Ordering::Relaxed); + } + r + } + + fn rename>(&self, src_path: P, dst_path: P) -> IoResult<()> { + self.inner.rename(src_path, dst_path) + } + + fn reuse>(&self, src_path: P, dst_path: P) -> IoResult<()> { + self.delete(src_path)?; + self.create(dst_path)?; + Ok(()) + } + + fn new_reader(&self, handle: Arc) -> IoResult { + Ok(ObfuscatedReader(self.inner.new_reader(handle)?)) + } + + fn new_writer(&self, handle: Arc) -> IoResult { + Ok(ObfuscatedWriter(self.inner.new_writer(handle)?)) + } +} diff --git a/third/raft-engine/src/errors.rs b/third/raft-engine/src/errors.rs new file mode 100644 index 00000000..cfb38e11 --- /dev/null +++ b/third/raft-engine/src/errors.rs @@ -0,0 +1,63 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use std::error; +use std::io::Error as IoError; + +use thiserror::Error; + +use crate::codec::Error as CodecError; + +#[derive(Debug, Error)] +pub enum ProstError { + #[error("Encode Error: {0}")] + Encode(#[from] prost::EncodeError), + #[error("Decode Error: {0}")] + Decode(#[from] prost::DecodeError), +} + +type ProtobufError = ProstError; + +#[derive(Debug, Error)] +pub enum Error { + #[error("Invalid Argument: {0}")] + InvalidArgument(String), + #[error("Corruption: {0}")] + Corruption(String), + #[error("IO Error: {0:?}")] + Io(#[from] IoError), + #[error("Codec Error: {0}")] + Codec(#[from] CodecError), + #[error("Protobuf Error: {0}")] + Protobuf(#[from] ProtobufError), + #[error("TryAgain Error: {0}")] + TryAgain(String), + #[error("Entry Compacted")] + EntryCompacted, + #[error("Entry Not Found")] + EntryNotFound, + #[error("Full")] + Full, + #[error("Other Error: {0}")] + Other(#[from] Box), +} + +pub type Result = ::std::result::Result; + +/// Check whether the given error is a nospace error. +pub(crate) fn is_no_space_err(e: &IoError) -> bool { + // TODO: make the following judgement more elegant when the error type + // `ErrorKind::StorageFull` is stable. + format!("{e}").contains("nospace") +} + +impl From for Error { + fn from(error: prost::EncodeError) -> Self { + ProstError::Encode(error).into() + } +} + +impl From for Error { + fn from(error: prost::DecodeError) -> Self { + ProstError::Decode(error).into() + } +} diff --git a/third/raft-engine/src/event_listener.rs b/third/raft-engine/src/event_listener.rs new file mode 100644 index 00000000..b89296c2 --- /dev/null +++ b/third/raft-engine/src/event_listener.rs @@ -0,0 +1,37 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
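
// Editor's note, not part of this patch: a minimal sketch of one way to implement the
// `EventListener` trait introduced below, assuming the crate's `FileId` type and the
// trait itself are in scope. `PurgeCounter` is a hypothetical name; it overrides a
// single callback and keeps the default no-op bodies for the rest.

use std::sync::atomic::{AtomicU64, Ordering};

#[derive(Default)]
pub struct PurgeCounter(AtomicU64);

impl EventListener for PurgeCounter {
    // Count how many log files have been purged; all other callbacks
    // fall back to the trait's default no-op implementations.
    fn post_purge(&self, _file_id: FileId) {
        self.0.fetch_add(1, Ordering::Relaxed);
    }
}
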
+
+use crate::pipe_log::{FileBlockHandle, FileId, FileSeq, LogQueue};
+
+/// `EventListener` contains a set of callback functions that will be notified
+/// on specific events inside Raft Engine.
+///
+/// # Threading
+///
+/// Different callbacks are called under different threading contexts.
+/// [`on_append_log_file`], for example, will be called under a global lock of
+/// one specific queue.
+///
+/// [`on_append_log_file`]: EventListener::on_append_log_file
+pub trait EventListener: Sync + Send {
+    /// Called *after* a new log file is created.
+    fn post_new_log_file(&self, _file_id: FileId) {}
+
+    /// Called *before* a [`LogBatch`] has been written into a log file.
+    ///
+    /// [`LogBatch`]: crate::log_batch::LogBatch
+    fn on_append_log_file(&self, _handle: FileBlockHandle) {}
+
+    /// Called *after* a [`LogBatch`] has been applied to the [`MemTable`].
+    ///
+    /// [`LogBatch`]: crate::log_batch::LogBatch
+    /// [`MemTable`]: crate::memtable::MemTable
+    fn post_apply_memtables(&self, _file_id: FileId) {}
+
+    /// Returns the oldest file sequence number that is not ready to be purged.
+    fn first_file_not_ready_for_purge(&self, _queue: LogQueue) -> Option<FileSeq> {
+        None
+    }
+
+    /// Called *after* a log file is purged.
+    fn post_purge(&self, _file_id: FileId) {}
+}
diff --git a/third/raft-engine/src/file_pipe_log/format.rs b/third/raft-engine/src/file_pipe_log/format.rs
new file mode 100644
index 00000000..58b6afc3
--- /dev/null
+++ b/third/raft-engine/src/file_pipe_log/format.rs
@@ -0,0 +1,323 @@
+// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0.
+
+//! Representations of objects in the filesystem.
+
+use std::io::BufRead;
+use std::path::{Path, PathBuf};
+
+use num_traits::{FromPrimitive, ToPrimitive};
+
+use crate::codec::{self, NumberEncoder};
+use crate::pipe_log::{FileId, FileSeq, LogQueue, Version};
+use crate::{Error, Result};
+
+/// Width to format the log sequence number.
+const LOG_SEQ_WIDTH: usize = 16;
+/// Name suffix for Append queue files.
+const LOG_APPEND_SUFFIX: &str = ".raftlog";
+/// Name suffix for Rewrite queue files.
+const LOG_REWRITE_SUFFIX: &str = ".rewrite";
+/// Name suffix for reserved log files that contain only zeros.
+const LOG_APPEND_RESERVED_SUFFIX: &str = ".raftlog.reserved";
+/// File header.
+const LOG_FILE_MAGIC_HEADER: &[u8] = b"RAFT-LOG-FILE-HEADER-9986AB3E47F320B394C8E84916EB0ED5";
+
+/// Checks whether the given `buf` is padded with zeros.
+///
+/// To simplify the checking strategy, we just check the first
+/// and last byte of `buf`.
+///
+/// In most common cases, the padding is filled with `0`. A few corner
+/// cases, where corrupted blocks exist on disk, might pass this rule,
+/// but they will fail in the processing that follows, so we can keep
+/// the check simple.
+#[inline]
+pub(crate) fn is_zero_padded(buf: &[u8]) -> bool {
+    buf.is_empty() || (buf[0] == 0 && buf[buf.len() - 1] == 0)
+}
+
+/// `FileNameExt` offers file name formatting extensions to [`FileId`].
+pub trait FileNameExt: Sized { + fn parse_file_name(file_name: &str) -> Option; + + fn build_file_name(&self) -> String; + + fn build_file_path>(&self, dir: P) -> PathBuf { + let mut path = PathBuf::from(dir.as_ref()); + path.push(self.build_file_name()); + path + } +} + +impl FileNameExt for FileId { + fn parse_file_name(file_name: &str) -> Option { + if file_name.len() > LOG_SEQ_WIDTH { + if let Ok(seq) = file_name[..LOG_SEQ_WIDTH].parse::() { + if file_name.ends_with(LOG_APPEND_SUFFIX) { + return Some(FileId { + queue: LogQueue::Append, + seq, + }); + } else if file_name.ends_with(LOG_REWRITE_SUFFIX) { + return Some(FileId { + queue: LogQueue::Rewrite, + seq, + }); + } + } + } + None + } + + fn build_file_name(&self) -> String { + let width = LOG_SEQ_WIDTH; + match self.queue { + LogQueue::Append => format!("{:0width$}{LOG_APPEND_SUFFIX}", self.seq,), + LogQueue::Rewrite => format!("{:0width$}{LOG_REWRITE_SUFFIX}", self.seq,), + } + } +} + +pub fn parse_reserved_file_name(file_name: &str) -> Option { + if file_name.len() > LOG_SEQ_WIDTH { + if let Ok(seq) = file_name[..LOG_SEQ_WIDTH].parse::() { + if file_name.ends_with(LOG_APPEND_RESERVED_SUFFIX) { + // As reserved files are only used for LogQueue::Append, + // we just return the related FileSeq of it. + return Some(seq); + } + } + } + None +} + +pub fn build_reserved_file_name(seq: FileSeq) -> String { + let width = LOG_SEQ_WIDTH; + format!("{seq:0width$}{LOG_APPEND_RESERVED_SUFFIX}",) +} + +/// Path to the lock file under `dir`. +pub(super) fn lock_file_path>(dir: P) -> PathBuf { + let mut path = PathBuf::from(dir.as_ref()); + path.push("LOCK"); + path +} + +/// Log file format. It will be encoded to file header. +#[derive(Copy, Clone, Debug, Eq, PartialEq, Default)] +pub struct LogFileFormat { + pub version: Version, + /// 0 stands for no alignment. + pub alignment: u64, +} + +impl LogFileFormat { + pub fn new(version: Version, alignment: u64) -> Self { + Self { version, alignment } + } + + /// Length of header written on storage. + const fn header_len() -> usize { + LOG_FILE_MAGIC_HEADER.len() + std::mem::size_of::() + } + + const fn payload_len(version: Version) -> usize { + match version { + Version::V1 => 0, + Version::V2 => std::mem::size_of::(), + } + } + + pub const fn max_encoded_len() -> usize { + Self::header_len() + Self::payload_len(Version::V2) + } + + /// Length of whole `LogFileFormat` written on storage. + pub fn encoded_len(version: Version) -> usize { + Self::header_len() + Self::payload_len(version) + } + + /// Decodes a slice of bytes into a `LogFileFormat`. + pub fn decode(buf: &mut &[u8]) -> Result { + let mut format = LogFileFormat::default(); + if !buf.starts_with(LOG_FILE_MAGIC_HEADER) { + return Err(Error::Corruption( + "log file magic header mismatch".to_owned(), + )); + } + buf.consume(LOG_FILE_MAGIC_HEADER.len()); + + let version_u64 = codec::decode_u64(buf)?; + if let Some(version) = Version::from_u64(version_u64) { + format.version = version; + } else { + return Err(Error::Corruption(format!( + "unrecognized log file version: {version_u64}", + ))); + } + + let payload_len = Self::payload_len(format.version); + if buf.len() < payload_len { + return Err(Error::Corruption("missing header payload".to_owned())); + } else if payload_len > 0 { + format.alignment = codec::decode_u64(buf)?; + } + + Ok(format) + } + + /// Encodes this header and appends the bytes to the provided buffer. 
+ pub fn encode(&self, buf: &mut Vec) -> Result<()> { + buf.extend_from_slice(LOG_FILE_MAGIC_HEADER); + buf.encode_u64(self.version.to_u64().unwrap())?; + if Self::payload_len(self.version) > 0 { + buf.encode_u64(self.alignment)?; + } else { + assert_eq!(self.alignment, 0); + } + #[cfg(feature = "failpoints")] + { + // Set header corrupted. + let corrupted = || { + fail::fail_point!("log_file_header::corrupted", |_| true); + false + }; + // Set abnormal DataLayout. + let too_large = || { + fail::fail_point!("log_file_header::too_large", |_| true); + false + }; + // Set corrupted DataLayout for `payload`. + let too_small = || { + fail::fail_point!("log_file_header::too_small", |_| true); + false + }; + if corrupted() { + buf[0] += 1; + } + assert!(!(too_large() && too_small())); + if too_large() { + buf.encode_u64(0_u64)?; + } + if too_small() { + buf.pop(); + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::pipe_log::LogFileContext; + use crate::test_util::catch_unwind_silent; + + #[test] + fn test_check_paddings_is_valid() { + // normal buffer + let mut buf = vec![0; 128]; + // len < 8 + assert!(is_zero_padded(&buf[0..6])); + // len == 8 + assert!(is_zero_padded(&buf[120..])); + // len > 8 + assert!(is_zero_padded(&buf)); + + // abnormal buffer + buf[127] = 3_u8; + assert!(is_zero_padded(&buf[0..110])); + assert!(is_zero_padded(&buf[120..125])); + assert!(!is_zero_padded(&buf[124..128])); + assert!(!is_zero_padded(&buf[120..])); + assert!(!is_zero_padded(&buf)); + } + + #[test] + fn test_file_name() { + let file_name: &str = "0000000000000123.raftlog"; + let file_id = FileId { + queue: LogQueue::Append, + seq: 123, + }; + assert_eq!(FileId::parse_file_name(file_name).unwrap(), file_id,); + assert_eq!(file_id.build_file_name(), file_name); + + let file_name: &str = "0000000000000123.rewrite"; + let file_id = FileId { + queue: LogQueue::Rewrite, + seq: 123, + }; + assert_eq!(FileId::parse_file_name(file_name).unwrap(), file_id,); + assert_eq!(file_id.build_file_name(), file_name); + + let invalid_cases = vec!["0000000000000123.log", "123.rewrite"]; + for case in invalid_cases { + assert!(FileId::parse_file_name(case).is_none()); + } + } + + #[test] + fn test_version() { + let version = Version::default(); + assert_eq!(Version::V1.to_u64().unwrap(), version.to_u64().unwrap()); + let version2 = Version::from_u64(1).unwrap(); + assert_eq!(version, version2); + } + + #[test] + fn test_encoding_decoding_file_format() { + fn enc_dec_file_format(file_format: LogFileFormat) -> Result { + let mut buf = Vec::with_capacity( + LogFileFormat::header_len() + LogFileFormat::payload_len(file_format.version), + ); + file_format.encode(&mut buf).unwrap(); + LogFileFormat::decode(&mut &buf[..]) + } + // header with aligned-sized data_layout + { + let mut buf = Vec::with_capacity(LogFileFormat::header_len()); + let version = Version::V2; + let alignment = 4096; + buf.extend_from_slice(LOG_FILE_MAGIC_HEADER); + buf.encode_u64(version.to_u64().unwrap()).unwrap(); + buf.encode_u64(alignment).unwrap(); + assert_eq!( + LogFileFormat::decode(&mut &buf[..]).unwrap(), + LogFileFormat::new(version, alignment) + ); + } + // header with abnormal version + { + let mut buf = Vec::with_capacity(LogFileFormat::header_len()); + let abnormal_version = 4_u64; /* abnormal version */ + buf.extend_from_slice(LOG_FILE_MAGIC_HEADER); + buf.encode_u64(abnormal_version).unwrap(); + buf.encode_u64(16).unwrap(); + assert!(LogFileFormat::decode(&mut &buf[..]).is_err()); + } + { + let file_format = 
LogFileFormat::new(Version::default(), 0); + assert_eq!( + LogFileFormat::new(Version::default(), 0), + enc_dec_file_format(file_format).unwrap() + ); + let file_format = LogFileFormat::new(Version::default(), 4096); + assert!(catch_unwind_silent(|| enc_dec_file_format(file_format)).is_err()); + } + } + + #[test] + fn test_file_context() { + let mut file_context = + LogFileContext::new(FileId::dummy(LogQueue::Append), Version::default()); + assert_eq!(file_context.get_signature(), None); + file_context.id.seq = 10; + file_context.version = Version::V2; + assert_eq!(file_context.get_signature().unwrap(), 10); + let abnormal_seq = (file_context.id.seq << 32) + 100_u64; + file_context.id.seq = abnormal_seq; + assert_ne!(file_context.get_signature().unwrap() as u64, abnormal_seq); + assert_eq!(file_context.get_signature().unwrap(), 100); + } +} diff --git a/third/raft-engine/src/file_pipe_log/log_file.rs b/third/raft-engine/src/file_pipe_log/log_file.rs new file mode 100644 index 00000000..aed5414a --- /dev/null +++ b/third/raft-engine/src/file_pipe_log/log_file.rs @@ -0,0 +1,221 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +//! Log file types. + +use std::io::{Read, Result as IoResult, Seek, SeekFrom, Write}; +use std::sync::Arc; + +use fail::fail_point; +use log::warn; + +use crate::env::{FileSystem, Handle, WriteExt}; +use crate::metrics::*; +use crate::pipe_log::FileBlockHandle; +use crate::util::round_down; +use crate::{Error, Result}; + +use super::format::LogFileFormat; + +/// Maximum number of bytes to allocate ahead. +const FILE_ALLOCATE_SIZE: usize = 2 * 1024 * 1024; + +const PAGE_SIZE: usize = 4096; +const SYNC_RANGE_SIZE: usize = 16 * PAGE_SIZE; + +/// Builds a file writer. +/// +/// # Arguments +/// +/// * `handle`: standard handle of a log file. +/// * `format`: format infos of the log file. +/// * `force_reset`: if true => rewrite the header of this file. +pub(super) fn build_file_writer( + system: &F, + handle: Arc, + format: LogFileFormat, + force_reset: bool, +) -> Result> { + let writer = system.new_writer(handle.clone())?; + LogFileWriter::open(handle, writer, format, force_reset) +} + +/// Append-only writer for log file. It also handles the file header write. +pub struct LogFileWriter { + handle: Arc, + writer: F::Writer, + synced: usize, + written: usize, + capacity: usize, +} + +// All APIs provided by `LogFileWriter` are fail-safe, i.e. caller can continue +// using the same "writer" even if the previous operation failed. +impl LogFileWriter { + fn open( + handle: Arc, + writer: F::Writer, + format: LogFileFormat, + force_reset: bool, + ) -> Result { + let file_size = handle.file_size()?; + let mut f = Self { + handle, + writer, + synced: round_down(file_size, PAGE_SIZE), + written: file_size, + capacity: file_size, + }; + // TODO: add tests for file_size in [header_len, max_encoded_len]. + if file_size < LogFileFormat::encoded_len(format.version) || force_reset { + f.write_header(format)?; + } else { + f.writer.seek(SeekFrom::Start(file_size as u64))?; + } + Ok(f) + } + + fn write_header(&mut self, format: LogFileFormat) -> IoResult<()> { + self.writer.rewind()?; + self.written = 0; + self.synced = 0; + let mut buf = Vec::with_capacity(LogFileFormat::encoded_len(format.version)); + format.encode(&mut buf).unwrap(); + self.write(&buf, 0) + } + + pub fn close(&mut self) -> IoResult<()> { + // Necessary to truncate extra zeros from fallocate(). 
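+        // (`write()` allocates capacity ahead of the write offset, so without
+        // this a closed file could end with a zero-filled tail.)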
+        self.truncate()?;
+        self.sync()
+    }
+
+    pub fn truncate(&mut self) -> IoResult<()> {
+        if self.written < self.capacity {
+            fail_point!("file_pipe_log::log_file_writer::skip_truncate", |_| {
+                Ok(())
+            });
+            self.writer.truncate(self.written)?;
+            self.capacity = self.written;
+        }
+        Ok(())
+    }
+
+    pub fn write(&mut self, buf: &[u8], target_size_hint: usize) -> IoResult<()> {
+        let new_written = self.written + buf.len();
+        if self.capacity < new_written {
+            let _t = StopWatch::new(&*LOG_ALLOCATE_DURATION_HISTOGRAM);
+            let alloc = std::cmp::min(
+                FILE_ALLOCATE_SIZE,
+                target_size_hint.saturating_sub(self.capacity),
+            );
+            let alloc = std::cmp::max(new_written - self.capacity, alloc);
+            if let Err(e) = self.writer.allocate(self.capacity, alloc) {
+                warn!("log file allocation failed: {e}");
+            }
+            self.capacity += alloc;
+        }
+        LOG_WRITE_BYTES_TOTAL.inc_by(buf.len() as u64);
+        self.writer.write_all(buf).map_err(|e| {
+            self.writer
+                .seek(SeekFrom::Start(self.written as u64))
+                .unwrap_or_else(|e| {
+                    panic!("failed to reseek after write failure: {}", e);
+                });
+            e
+        })?;
+        self.written = new_written;
+        if self.written >= self.synced + SYNC_RANGE_SIZE {
+            debug_assert_eq!(self.synced % PAGE_SIZE, 0);
+            let nbytes = round_down(self.written - self.synced, PAGE_SIZE);
+            self.handle.sync_range(self.synced, nbytes)?;
+            self.synced += nbytes;
+        }
+        Ok(())
+    }
+
+    pub fn sync(&mut self) -> IoResult<()> {
+        let _t = StopWatch::new(&*LOG_SYNC_DURATION_HISTOGRAM);
+        // Panic if sync fails, in case of data loss.
+        self.handle.sync().unwrap();
+        Ok(())
+    }
+
+    #[inline]
+    pub fn offset(&self) -> usize {
+        self.written
+    }
+}
+
+/// Builds a file reader.
+pub(super) fn build_file_reader<F: FileSystem>(
+    system: &F,
+    handle: Arc<F::Handle>,
+) -> Result<LogFileReader<F>> {
+    let reader = system.new_reader(handle.clone())?;
+    Ok(LogFileReader::open(handle, reader))
+}
+
+/// Random-access reader for log file.
+pub struct LogFileReader<F: FileSystem> {
+    handle: Arc<F::Handle>,
+    reader: F::Reader,
+
+    offset: u64,
+}
+
+impl<F: FileSystem> LogFileReader<F> {
+    fn open(handle: Arc<F::Handle>, reader: F::Reader) -> LogFileReader<F> {
+        Self {
+            handle,
+            reader,
+            // Set to an invalid offset to force a reseek at first read.
+            offset: u64::MAX,
+        }
+    }
+
+    /// Reads the header of the log file and returns its [`LogFileFormat`].
+    ///
+    /// Note that this seeks `reader.offset` back to `0`, i.e. the beginning
+    /// of the file, in order to parse the header.
+    pub fn parse_format(&mut self) -> Result<LogFileFormat> {
+        let mut container = vec![0; LogFileFormat::max_encoded_len()];
+        let size = self.read_to(0, &mut container)?;
+        container.truncate(size);
+        LogFileFormat::decode(&mut container.as_slice())
+    }
+
+    pub fn read(&mut self, handle: FileBlockHandle) -> Result<Vec<u8>> {
+        let mut buf = vec![0; handle.len];
+        let size = self.read_to(handle.offset, &mut buf)?;
+        buf.truncate(size);
+        Ok(buf)
+    }
+
+    /// Polls bytes from the file. Stops only when the buffer is filled or
+    /// the end of the file is reached.
+    pub fn read_to(&mut self, offset: u64, mut buf: &mut [u8]) -> Result<usize> {
+        if offset != self.offset {
+            self.reader.seek(SeekFrom::Start(offset))?;
+            self.offset = offset;
+        }
+        loop {
+            match self.reader.read(buf) {
+                Ok(0) => break,
+                Ok(n) => {
+                    self.offset += n as u64;
+                    buf = &mut buf[n..];
+                }
+                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
+                Err(e) => return Err(Error::Io(e)),
+            }
+        }
+        Ok((self.offset - offset) as usize)
+    }
+
+    #[inline]
+    pub fn file_size(&self) -> Result<usize> {
+        Ok(self.handle.file_size()?)
+ } +} diff --git a/third/raft-engine/src/file_pipe_log/mod.rs b/third/raft-engine/src/file_pipe_log/mod.rs new file mode 100644 index 00000000..c65515a3 --- /dev/null +++ b/third/raft-engine/src/file_pipe_log/mod.rs @@ -0,0 +1,353 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +//! A [`PipeLog`] implementation that stores data in filesystem. +//! +//! [`PipeLog`]: crate::pipe_log::PipeLog + +mod format; +mod log_file; +mod pipe; +mod pipe_builder; +mod reader; + +pub use format::{parse_reserved_file_name, FileNameExt}; +pub use pipe::DualPipes as FilePipeLog; +pub use pipe_builder::{ + DefaultMachineFactory, DualPipesBuilder as FilePipeLogBuilder, RecoveryConfig, ReplayMachine, +}; + +pub mod debug { + //! A set of public utilities used for interacting with log files. + + use std::collections::VecDeque; + use std::path::{Path, PathBuf}; + use std::sync::Arc; + + use crate::env::{FileSystem, Permission}; + use crate::log_batch::LogItem; + use crate::pipe_log::FileId; + use crate::{Error, Result}; + + use super::format::{FileNameExt, LogFileFormat}; + use super::log_file::{LogFileReader, LogFileWriter}; + use super::reader::LogItemBatchFileReader; + + /// Opens a log file for write. When `create` is true, the specified file + /// will be created first if not exists. + #[allow(dead_code)] + pub fn build_file_writer( + file_system: &F, + path: &Path, + format: LogFileFormat, + create: bool, + ) -> Result> { + let fd = if create { + file_system.create(path)? + } else { + file_system.open(path, Permission::ReadWrite)? + }; + let fd = Arc::new(fd); + super::log_file::build_file_writer(file_system, fd, format, create /* force_reset */) + } + + /// Opens a log file for read. + pub fn build_file_reader( + file_system: &F, + path: &Path, + ) -> Result> { + let fd = Arc::new(file_system.open(path, Permission::ReadOnly)?); + super::log_file::build_file_reader(file_system, fd) + } + + /// An iterator over the log items in log files. + pub struct LogItemReader { + system: Arc, + files: VecDeque<(FileId, PathBuf)>, + batch_reader: LogItemBatchFileReader, + items: VecDeque, + } + + impl Iterator for LogItemReader { + type Item = Result; + + fn next(&mut self) -> Option { + self.next() + } + } + + impl LogItemReader { + /// Creates a new log item reader over one specified log file. + pub fn new_file_reader(system: Arc, file: &Path) -> Result { + if !file.is_file() { + return Err(Error::InvalidArgument(format!( + "Not a file: {}", + file.display() + ))); + } + let file_name = file.file_name().unwrap().to_str().unwrap(); + let file_id = FileId::parse_file_name(file_name); + if file_id.is_none() { + return Err(Error::InvalidArgument(format!( + "Invalid log file name: {file_name}" + ))); + } + Ok(Self { + system, + files: vec![(file_id.unwrap(), file.into())].into(), + batch_reader: LogItemBatchFileReader::new(0), + items: VecDeque::new(), + }) + } + + /// Creates a new log item reader over all log files under the + /// specified directory. + pub fn new_directory_reader(system: Arc, dir: &Path) -> Result { + if !dir.is_dir() { + return Err(Error::InvalidArgument(format!( + "Not a directory: {}", + dir.display() + ))); + } + let mut files: Vec<_> = std::fs::read_dir(dir)? 
+ .filter_map(|e| { + if let Ok(e) = e { + let p = e.path(); + if p.is_file() { + if let Some(file_id) = + FileId::parse_file_name(p.file_name().unwrap().to_str().unwrap()) + { + return Some((file_id, p)); + } + } + } + None + }) + .collect(); + files.sort_by_key(|pair| pair.0); + Ok(Self { + system, + files: files.into(), + batch_reader: LogItemBatchFileReader::new(0), + items: VecDeque::new(), + }) + } + + fn next(&mut self) -> Option> { + if self.items.is_empty() { + let next_batch = self.batch_reader.next(); + match next_batch { + Ok(Some(b)) => { + self.items.extend(b.into_items()); + } + Ok(None) => { + if let Err(e) = self.find_next_readable_file() { + self.batch_reader.reset(); + return Some(Err(e)); + } + } + Err(e) => { + self.batch_reader.reset(); + return Some(Err(e)); + } + } + } + self.items.pop_front().map(Ok) + } + + fn find_next_readable_file(&mut self) -> Result<()> { + while let Some((file_id, path)) = self.files.pop_front() { + let reader = build_file_reader(self.system.as_ref(), &path)?; + self.batch_reader.open(file_id, reader)?; + if let Some(b) = self.batch_reader.next()? { + self.items.extend(b.into_items()); + break; + } + } + Ok(()) + } + } + + #[cfg(test)] + mod tests { + use super::*; + use crate::env::DefaultFileSystem; + use crate::log_batch::{Command, LogBatch}; + use crate::pipe_log::{FileBlockHandle, LogFileContext, LogQueue, Version}; + use crate::test_util::{generate_entries, PanicGuard}; + use raft::eraftpb::Entry; + + #[test] + fn test_debug_file_basic() { + let dir = tempfile::Builder::new() + .prefix("test_debug_file_basic") + .tempdir() + .unwrap(); + let mut file_id = FileId { + queue: LogQueue::Rewrite, + seq: 7, + }; + let file_system = Arc::new(DefaultFileSystem); + let entry_data = vec![b'x'; 1024]; + + let mut batches = vec![vec![LogBatch::default()]]; + let mut batch = LogBatch::default(); + batch + .add_entries::(7, &generate_entries(1, 11, Some(&entry_data))) + .unwrap(); + batch.add_command(7, Command::Clean); + batch.put(7, b"key".to_vec(), b"value".to_vec()).unwrap(); + batch.delete(7, b"key2".to_vec()); + batches.push(vec![batch.clone()]); + let mut batch2 = LogBatch::default(); + batch2.put(8, b"key3".to_vec(), b"value".to_vec()).unwrap(); + batch2 + .add_entries::(8, &generate_entries(5, 15, Some(&entry_data))) + .unwrap(); + batches.push(vec![batch, batch2]); + + for bs in batches.iter_mut() { + let file_path = file_id.build_file_path(dir.path()); + // Write a file. + let mut writer = build_file_writer( + file_system.as_ref(), + &file_path, + LogFileFormat::default(), + true, /* create */ + ) + .unwrap(); + let log_file_format = LogFileContext::new(file_id, Version::default()); + for batch in bs.iter_mut() { + let offset = writer.offset() as u64; + let (len, _) = batch + .finish_populate(1 /* compression_threshold */, None) + .unwrap(); + batch.prepare_write(&log_file_format).unwrap(); + writer + .write(batch.encoded_bytes(), 0 /* target_file_hint */) + .unwrap(); + batch.finish_write(FileBlockHandle { + id: file_id, + offset, + len, + }); + } + writer.close().unwrap(); + // Read and verify. + let mut reader = + LogItemReader::new_file_reader(file_system.clone(), &file_path).unwrap(); + for batch in bs { + for item in batch.clone().drain() { + assert_eq!(item, reader.next().unwrap().unwrap()); + } + } + assert!(reader.next().is_none()); + file_id.seq += 1; + } + // Read directory and verify. 
+ let mut reader = LogItemReader::new_directory_reader(file_system, dir.path()).unwrap(); + for bs in batches.iter() { + for batch in bs { + for item in batch.clone().drain() { + assert_eq!(item, reader.next().unwrap().unwrap()); + } + } + } + assert!(reader.next().is_none()) + } + + #[test] + fn test_debug_file_error() { + let dir = tempfile::Builder::new() + .prefix("test_debug_file_error") + .tempdir() + .unwrap(); + let file_system = Arc::new(DefaultFileSystem); + // An unrelated sub-directory. + let unrelated_dir = dir.path().join(Path::new("random_dir")); + std::fs::create_dir(unrelated_dir).unwrap(); + // An unrelated file. + let unrelated_file_path = dir.path().join(Path::new("random_file")); + let _unrelated_file = std::fs::File::create(&unrelated_file_path).unwrap(); + // A corrupted log file. + let corrupted_file_path = FileId::dummy(LogQueue::Append).build_file_path(dir.path()); + let _corrupted_file = std::fs::File::create(corrupted_file_path).unwrap(); + // An empty log file. + let empty_file_path = FileId::dummy(LogQueue::Rewrite).build_file_path(dir.path()); + let mut writer = build_file_writer( + file_system.as_ref(), + &empty_file_path, + LogFileFormat::default(), + true, /* create */ + ) + .unwrap(); + writer.close().unwrap(); + + assert!(LogItemReader::new_file_reader(file_system.clone(), dir.path()).is_err()); + assert!( + LogItemReader::new_file_reader(file_system.clone(), &unrelated_file_path).is_err() + ); + assert!( + LogItemReader::new_directory_reader(file_system.clone(), &empty_file_path).is_err() + ); + LogItemReader::new_file_reader(file_system.clone(), &empty_file_path).unwrap(); + + let mut reader = LogItemReader::new_directory_reader(file_system, dir.path()).unwrap(); + assert!(reader.next().unwrap().is_err()); + assert!(reader.next().is_none()); + } + + #[test] + fn test_recover_from_partial_write() { + let dir = tempfile::Builder::new() + .prefix("test_debug_file_overwrite") + .tempdir() + .unwrap(); + let file_system = Arc::new(DefaultFileSystem); + + let path = FileId::dummy(LogQueue::Append).build_file_path(dir.path()); + + let formats = [ + LogFileFormat::new(Version::V1, 0), + LogFileFormat::new(Version::V2, 1), + ]; + for from in formats { + for to in formats { + for shorter in [true, false] { + if LogFileFormat::encoded_len(to.version) + < LogFileFormat::encoded_len(from.version) + { + continue; + } + let _guard = PanicGuard::with_prompt(format!( + "case: [{from:?}, {to:?}, {shorter:?}]", + )); + let mut writer = build_file_writer( + file_system.as_ref(), + &path, + from, + true, /* create */ + ) + .unwrap(); + let f = std::fs::OpenOptions::new().write(true).open(&path).unwrap(); + let len = writer.offset(); + writer.close().unwrap(); + if shorter { + f.set_len(len as u64 - 1).unwrap(); + } + let mut writer = build_file_writer( + file_system.as_ref(), + &path, + to, + false, /* create */ + ) + .unwrap(); + writer.close().unwrap(); + let mut reader = build_file_reader(file_system.as_ref(), &path).unwrap(); + assert_eq!(reader.parse_format().unwrap(), to); + std::fs::remove_file(&path).unwrap(); + } + } + } + } + } +} diff --git a/third/raft-engine/src/file_pipe_log/pipe.rs b/third/raft-engine/src/file_pipe_log/pipe.rs new file mode 100644 index 00000000..27ea8267 --- /dev/null +++ b/third/raft-engine/src/file_pipe_log/pipe.rs @@ -0,0 +1,730 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
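+
+//! Log file pipes: one [`SinglePipe`] per log queue, combined into a
+//! [`DualPipes`] that implements [`PipeLog`].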
+ +use std::collections::VecDeque; +use std::fs::File as StdFile; +use std::path::PathBuf; +use std::sync::Arc; + +use crossbeam::utils::CachePadded; +use fail::fail_point; +use log::error; +use parking_lot::{Mutex, MutexGuard, RwLock}; + +use crate::config::Config; +use crate::env::{FileSystem, Permission}; +use crate::errors::is_no_space_err; +use crate::event_listener::EventListener; +use crate::metrics::*; +use crate::pipe_log::{ + FileBlockHandle, FileId, FileSeq, LogFileContext, LogQueue, PipeLog, ReactiveBytes, +}; +use crate::{perf_context, Error, Result}; + +use super::format::{build_reserved_file_name, FileNameExt, LogFileFormat}; +use super::log_file::build_file_reader; +use super::log_file::{build_file_writer, LogFileWriter}; + +pub type PathId = usize; +pub type Paths = Vec; + +/// Main directory path id. +pub const DEFAULT_PATH_ID: PathId = 0; +/// FileSeq of logs must start from `1` by default to keep backward +/// compatibility. +pub const DEFAULT_FIRST_FILE_SEQ: FileSeq = 1; + +pub struct File { + pub seq: FileSeq, + pub handle: Arc, + pub format: LogFileFormat, + pub path_id: PathId, + pub reserved: bool, +} + +struct WritableFile { + pub seq: FileSeq, + pub writer: LogFileWriter, + pub format: LogFileFormat, +} + +/// A file-based log storage that arranges files as one single queue. +pub(super) struct SinglePipe { + queue: LogQueue, + paths: Paths, + file_system: Arc, + listeners: Vec>, + default_format: LogFileFormat, + target_file_size: usize, + + capacity: usize, + active_files: CachePadded>>>, + /// This contains both reserved files and files recycled from + /// `active_files`. + recycled_files: CachePadded>>>, + + /// The log file opened for write. + /// + /// `writable_file` must be locked first to acquire both `files` and + /// `writable_file` + writable_file: CachePadded>>, +} + +impl Drop for SinglePipe { + fn drop(&mut self) { + let mut writable_file = self.writable_file.lock(); + if let Err(e) = writable_file.writer.close() { + error!("error while closing the active writer: {e}"); + } + let mut recycled_files = self.recycled_files.write(); + let mut next_reserved_seq = recycled_files + .iter() + .rev() + .find_map(|f| if f.reserved { Some(f.seq + 1) } else { None }) + .unwrap_or(DEFAULT_FIRST_FILE_SEQ); + while let Some(f) = recycled_files.pop_back() { + if f.reserved { + break; + } + let file_id = FileId::new(self.queue, f.seq); + let path = file_id.build_file_path(&self.paths[f.path_id]); + let dst = self.paths[0].join(build_reserved_file_name(next_reserved_seq)); + if let Err(e) = self.file_system.reuse(path, dst) { + error!("error while renaming recycled file during shutdown: {}", e); + } else { + next_reserved_seq += 1; + } + } + } +} + +impl SinglePipe { + /// Opens a new [`SinglePipe`]. + pub fn open( + cfg: &Config, + paths: Paths, + file_system: Arc, + listeners: Vec>, + queue: LogQueue, + mut active_files: Vec>, + recycled_files: Vec>, + ) -> Result { + let alignment = || { + fail_point!("file_pipe_log::open::force_set_alignment", |_| { 16 }); + 0 + }; + let default_format = LogFileFormat::new(cfg.format_version, alignment()); + + // Open or create active file. 
+ let no_active_files = active_files.is_empty(); + if no_active_files { + let path_id = find_available_dir(&paths, cfg.target_file_size.0 as usize); + let file_id = FileId::new(queue, DEFAULT_FIRST_FILE_SEQ); + let path = file_id.build_file_path(&paths[path_id]); + active_files.push(File { + seq: file_id.seq, + handle: file_system.create(path)?.into(), + format: default_format, + path_id, + reserved: false, + }); + } + let f = active_files.last().unwrap(); + // If starting from active_files.emtpy(), we should reset the first file with + // given file format. + let writable_file = WritableFile { + seq: f.seq, + writer: build_file_writer( + file_system.as_ref(), + f.handle.clone(), + f.format, + no_active_files, /* force_reset */ + )?, + format: f.format, + }; + + let (len, recycled_len) = (active_files.len(), recycled_files.len()); + for f in active_files.iter() { + for listener in &listeners { + listener.post_new_log_file(FileId { queue, seq: f.seq }); + } + } + + let pipe = Self { + queue, + paths, + file_system, + listeners, + default_format, + target_file_size: cfg.target_file_size.0 as usize, + capacity: if queue == LogQueue::Append { + cfg.recycle_capacity() + } else { + 0 + }, + active_files: RwLock::new(active_files.into()).into(), + recycled_files: RwLock::new(recycled_files.into()).into(), + writable_file: Mutex::new(writable_file).into(), + }; + pipe.flush_metrics(len); + pipe.flush_recycle_metrics(recycled_len); + Ok(pipe) + } + + /// Synchronizes all metadatas associated with the working directory to the + /// filesystem. + fn sync_dir(&self, path_id: PathId) -> Result<()> { + debug_assert!(!self.paths.is_empty()); + + // Skip syncing directory in Windows. Refer to badger's discussion for more + // detail: https://github.com/dgraph-io/badger/issues/699 + // + // Panic if sync calls fail, keep consistent with the behavior of + // `LogFileWriter::sync()`. + #[cfg(not(windows))] + std::fs::File::open(PathBuf::from(&self.paths[path_id])) + .and_then(|d| d.sync_all()) + .unwrap(); + Ok(()) + } + + /// Recycles one obsolete file from the recycled file list and return its + /// [`PathId`] and [`F::Handle`] if success. + fn recycle_file(&self, seq: FileSeq) -> Option> { + let new_file_id = FileId { + seq, + queue: self.queue, + }; + let (recycle_file, recycle_len) = { + let mut recycled_files = self.recycled_files.write(); + (recycled_files.pop_front(), recycled_files.len()) + }; + if let Some(f) = recycle_file { + let fname = if f.reserved { + build_reserved_file_name(f.seq) + } else { + FileId::new(self.queue, f.seq).build_file_name() + }; + let src_path = self.paths[f.path_id].join(fname); + let dst_path = new_file_id.build_file_path(&self.paths[f.path_id]); + if let Err(e) = self.file_system.reuse(&src_path, &dst_path) { + error!("error while trying to reuse recycled file, err: {e}"); + if let Err(e) = self.file_system.delete(&src_path) { + error!("error while trying to delete recycled file, err: {e}"); + } + } else { + self.flush_recycle_metrics(recycle_len); + return match self.file_system.open(&dst_path, Permission::ReadWrite) { + Ok(handle) => Some(Ok((f.path_id, handle))), + Err(e) => Some(Err(e.into())), + }; + } + } + None + } + + /// Creates a new log file according to the given [`FileSeq`]. 
+ fn new_file(&self, seq: FileSeq) -> Result<(PathId, F::Handle)> { + let new_file_id = FileId { + seq, + queue: self.queue, + }; + let path_id = find_available_dir(&self.paths, self.target_file_size); + let path = new_file_id.build_file_path(&self.paths[path_id]); + Ok((path_id, self.file_system.create(path)?)) + } + + /// Returns a shared [`LogFd`] for the specified file sequence number. + fn get_fd(&self, file_seq: FileSeq) -> Result> { + let files = self.active_files.read(); + if !(files[0].seq..files[0].seq + files.len() as u64).contains(&file_seq) { + return Err(Error::Corruption("file seqno out of range".to_owned())); + } + Ok(files[(file_seq - files[0].seq) as usize].handle.clone()) + } + + /// Creates a new file for write, and rotates the active log file. + /// + /// This operation is atomic in face of errors. + fn rotate_imp(&self, writable_file: &mut MutexGuard>) -> Result<()> { + let _t = StopWatch::new(( + &*LOG_ROTATE_DURATION_HISTOGRAM, + perf_context!(log_rotate_duration), + )); + let new_seq = writable_file.seq + 1; + debug_assert!(new_seq > DEFAULT_FIRST_FILE_SEQ); + + writable_file.writer.close()?; + + let (path_id, handle) = self + .recycle_file(new_seq) + .unwrap_or_else(|| self.new_file(new_seq))?; + let f = File:: { + seq: new_seq, + handle: handle.into(), + format: self.default_format, + path_id, + reserved: false, + }; + let mut new_file = WritableFile { + seq: new_seq, + writer: build_file_writer( + self.file_system.as_ref(), + f.handle.clone(), + f.format, + true, /* force_reset */ + )?, + format: f.format, + }; + // File header must be persisted. This way we can recover gracefully if power + // loss before a new entry is written. + new_file.writer.sync()?; + self.sync_dir(path_id)?; + + **writable_file = new_file; + let len = { + let mut files = self.active_files.write(); + files.push_back(f); + files.len() + }; + self.flush_metrics(len); + for listener in &self.listeners { + listener.post_new_log_file(FileId { + queue: self.queue, + seq: new_seq, + }); + } + Ok(()) + } + + /// Synchronizes current states to related metrics. + fn flush_metrics(&self, len: usize) { + match self.queue { + LogQueue::Append => LOG_FILE_COUNT.append.set(len as i64), + LogQueue::Rewrite => LOG_FILE_COUNT.rewrite.set(len as i64), + } + } + + /// Synchronizes current recycled states to related metrics. + fn flush_recycle_metrics(&self, len: usize) { + match self.queue { + LogQueue::Append => RECYCLED_FILE_COUNT.append.set(len as i64), + LogQueue::Rewrite => RECYCLED_FILE_COUNT.rewrite.set(len as i64), + } + } +} + +impl SinglePipe { + fn read_bytes(&self, handle: FileBlockHandle) -> Result> { + let fd = self.get_fd(handle.id.seq)?; + // As the header of each log file already parsed in the processing of loading + // log files, we just need to build the `LogFileReader`. 
+        let mut reader = build_file_reader(self.file_system.as_ref(), fd)?;
+        reader.read(handle)
+    }
+
+    fn append<T: ReactiveBytes + ?Sized>(&self, bytes: &mut T) -> Result<FileBlockHandle> {
+        fail_point!("file_pipe_log::append");
+        let mut writable_file = self.writable_file.lock();
+        if writable_file.writer.offset() >= self.target_file_size {
+            self.rotate_imp(&mut writable_file)?;
+        }
+
+        let seq = writable_file.seq;
+        let format = writable_file.format;
+        let ctx = LogFileContext {
+            id: FileId::new(self.queue, seq),
+            version: format.version,
+        };
+        let writer = &mut writable_file.writer;
+
+        #[cfg(feature = "failpoints")]
+        {
+            use crate::util::round_up;
+
+            let corrupted_padding = || {
+                fail_point!("file_pipe_log::append::corrupted_padding", |_| true);
+                false
+            };
+            if format.version.has_log_signing() && format.alignment > 0 {
+                let s_off = round_up(writer.offset(), format.alignment as usize);
+                if s_off > writer.offset() {
+                    let len = s_off - writer.offset();
+                    let mut zeros = vec![0; len];
+                    if corrupted_padding() {
+                        zeros[len - 1] = 8_u8;
+                    }
+                    writer.write(&zeros, self.target_file_size)?;
+                }
+            }
+        }
+        let start_offset = writer.offset();
+        if let Err(e) = writer.write(bytes.as_bytes(&ctx), self.target_file_size) {
+            writer.truncate()?;
+            if is_no_space_err(&e) {
+                // TODO: Several corner cases should be handled if `bytes.len()` >
+                // `target_file_size`. For example:
+                // - [1] main-dir has no recycled logs, but spill-dir has several.
+                // - [2] main-dir has several recycled logs, and sum(recycled_logs.size()) <
+                //   expected_file_size, but spill-dir has none.
+                // - [3] Both main-dir and spill-dir have several recycled logs.
+                // As `bytes.len()` is always smaller than `target_file_size` in common
+                // cases, this issue is ignored for now.
+                self.rotate_imp(&mut writable_file)?;
+                // If there is still free space for this record, rotate the file
+                // and return a special TryAgain error so the caller can retry.
+ return Err(Error::TryAgain(format!( + "error when append [{:?}:{seq}]: {e}", + self.queue, + ))); + } + return Err(Error::Io(e)); + } + let handle = FileBlockHandle { + id: FileId { + queue: self.queue, + seq, + }, + offset: start_offset as u64, + len: writer.offset() - start_offset, + }; + for listener in &self.listeners { + listener.on_append_log_file(handle); + } + Ok(handle) + } + + fn sync(&self) -> Result<()> { + let mut writable_file = self.writable_file.lock(); + let writer = &mut writable_file.writer; + let _t = StopWatch::new(perf_context!(log_sync_duration)); + writer.sync().map_err(Error::Io)?; + Ok(()) + } + + fn file_span(&self) -> (FileSeq, FileSeq) { + let files = self.active_files.read(); + (files[0].seq, files[files.len() - 1].seq) + } + + fn total_size(&self) -> usize { + let (first_seq, last_seq) = self.file_span(); + (last_seq - first_seq + 1) as usize * self.target_file_size + } + + fn rotate(&self) -> Result<()> { + self.rotate_imp(&mut self.writable_file.lock()) + } + + fn purge_to(&self, file_seq: FileSeq) -> Result { + let (len, purged_files) = { + let mut files = self.active_files.write(); + if !(files[0].seq..files[0].seq + files.len() as u64).contains(&file_seq) { + return Err(box_err!("FileSeq out of range, cannot be purged")); + } + let off = (file_seq - files[0].seq) as usize; + let mut tail = files.split_off(off); + std::mem::swap(&mut tail, &mut files); + (files.len(), tail) + }; + let purged_len = purged_files.len(); + if purged_len > 0 { + let remains_capacity = self.capacity.saturating_sub(len); + let mut recycled_len = self.recycled_files.read().len(); + let mut new_recycled = VecDeque::new(); + // We don't rename the append file because on some platform it could cause I/O + // jitters. Instead we best-effort rename them during shutdown to reduce + // recovery time. + for f in purged_files { + let file_id = FileId { + seq: f.seq, + queue: self.queue, + }; + let path = file_id.build_file_path(&self.paths[f.path_id]); + // Recycle purged files whose version meets the requirement. + if f.format.version.has_log_signing() && recycled_len < remains_capacity { + new_recycled.push_back(f); + recycled_len += 1; + continue; + } + // Remove purged files which are out of capacity and files whose version is + // marked not recycled. + self.file_system.delete(path)?; + } + debug_assert!(recycled_len <= remains_capacity); + self.recycled_files.write().append(&mut new_recycled); + self.flush_recycle_metrics(recycled_len); + } + self.flush_metrics(len); + Ok(purged_len) + } +} + +/// A [`PipeLog`] implementation that stores data in filesystem. +pub struct DualPipes { + pipes: [SinglePipe; 2], + + _dir_locks: Vec, +} + +impl DualPipes { + /// Open a new [`DualPipes`]. Assumes the two [`SinglePipe`]s share the + /// same directory, and that directory is locked by `dir_lock`. + pub(super) fn open( + dir_locks: Vec, + appender: SinglePipe, + rewriter: SinglePipe, + ) -> Result { + // TODO: remove this dependency. 
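+        // (`pipes` is indexed by `LogQueue as usize` below, hence these checks.)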
+ debug_assert_eq!(LogQueue::Append as usize, 0); + debug_assert_eq!(LogQueue::Rewrite as usize, 1); + + Ok(Self { + pipes: [appender, rewriter], + _dir_locks: dir_locks, + }) + } + + #[cfg(test)] + pub fn file_system(&self) -> Arc { + self.pipes[0].file_system.clone() + } +} + +impl PipeLog for DualPipes { + #[inline] + fn read_bytes(&self, handle: FileBlockHandle) -> Result> { + self.pipes[handle.id.queue as usize].read_bytes(handle) + } + + #[inline] + fn append( + &self, + queue: LogQueue, + bytes: &mut T, + ) -> Result { + self.pipes[queue as usize].append(bytes) + } + + #[inline] + fn sync(&self, queue: LogQueue) -> Result<()> { + self.pipes[queue as usize].sync() + } + + #[inline] + fn file_span(&self, queue: LogQueue) -> (FileSeq, FileSeq) { + self.pipes[queue as usize].file_span() + } + + #[inline] + fn total_size(&self, queue: LogQueue) -> usize { + self.pipes[queue as usize].total_size() + } + + #[inline] + fn rotate(&self, queue: LogQueue) -> Result<()> { + self.pipes[queue as usize].rotate() + } + + #[inline] + fn purge_to(&self, file_id: FileId) -> Result { + self.pipes[file_id.queue as usize].purge_to(file_id.seq) + } +} + +/// Fetch and return a valid `PathId` of the specific directories. +pub(crate) fn find_available_dir(paths: &Paths, target_size: usize) -> PathId { + fail_point!("file_pipe_log::force_choose_dir", |s| s + .map_or(DEFAULT_PATH_ID, |n| n.parse::().unwrap())); + // Only if one single dir is set by `Config::dir`, can it skip the check of disk + // space usage. + if paths.len() > 1 { + for (t, p) in paths.iter().enumerate() { + if let Ok(disk_stats) = fs2::statvfs(p) { + if target_size <= disk_stats.available_space() as usize { + return t; + } + } + } + } + DEFAULT_PATH_ID +} + +#[cfg(test)] +mod tests { + use std::path::Path; + use tempfile::Builder; + + use super::super::format::LogFileFormat; + use super::super::pipe_builder::lock_dir; + use super::*; + use crate::env::{DefaultFileSystem, ObfuscatedFileSystem}; + use crate::pipe_log::Version; + use crate::util::ReadableSize; + + fn new_test_pipe( + cfg: &Config, + paths: Paths, + queue: LogQueue, + fs: Arc, + ) -> Result> { + SinglePipe::open(cfg, paths, fs, Vec::new(), queue, Vec::new(), Vec::new()) + } + + fn new_test_pipes(cfg: &Config) -> Result> { + DualPipes::open( + vec![lock_dir(&cfg.dir)?], + new_test_pipe( + cfg, + vec![Path::new(&cfg.dir).to_path_buf()], + LogQueue::Append, + Arc::new(DefaultFileSystem), + )?, + new_test_pipe( + cfg, + vec![Path::new(&cfg.dir).to_path_buf()], + LogQueue::Rewrite, + Arc::new(DefaultFileSystem), + )?, + ) + } + + #[test] + fn test_dir_lock() { + let dir = Builder::new().prefix("test_dir_lock").tempdir().unwrap(); + let path = dir.path().to_str().unwrap(); + let cfg = Config { + dir: path.to_owned(), + ..Default::default() + }; + + let _r1 = new_test_pipes(&cfg).unwrap(); + + // Only one thread can hold file lock + let r2 = new_test_pipes(&cfg); + + assert!(format!("{}", r2.err().unwrap()) + .contains("maybe another instance is using this directory")); + } + + #[test] + fn test_pipe_log() { + let dir = Builder::new().prefix("test_pipe_log").tempdir().unwrap(); + let path = dir.path().to_str().unwrap(); + let cfg = Config { + dir: path.to_owned(), + target_file_size: ReadableSize::kb(1), + ..Default::default() + }; + let queue = LogQueue::Append; + + let pipe_log = new_test_pipes(&cfg).unwrap(); + assert_eq!(pipe_log.file_span(queue), (1, 1)); + + let header_size = LogFileFormat::encoded_len(cfg.format_version) as u64; + + // generate file 1, 2, 3 + let content: Vec 
= vec![b'a'; 1024]; + let file_handle = pipe_log.append(queue, &mut &content).unwrap(); + assert_eq!(file_handle.id.seq, 1); + assert_eq!(file_handle.offset, header_size); + assert_eq!(pipe_log.file_span(queue).1, 1); + + let file_handle = pipe_log.append(queue, &mut &content).unwrap(); + assert_eq!(file_handle.id.seq, 2); + assert_eq!(file_handle.offset, header_size); + assert_eq!(pipe_log.file_span(queue).1, 2); + + pipe_log.rotate(queue).unwrap(); + + // purge file 1 + assert_eq!(pipe_log.purge_to(FileId { queue, seq: 2 }).unwrap(), 1); + assert_eq!(pipe_log.file_span(queue).0, 2); + + // cannot purge active file + assert!(pipe_log.purge_to(FileId { queue, seq: 4 }).is_err()); + + // append position + let s_content = b"short content".to_vec(); + let file_handle = pipe_log.append(queue, &mut &s_content).unwrap(); + assert_eq!(file_handle.id.seq, 3); + assert_eq!(file_handle.offset, header_size); + + let file_handle = pipe_log.append(queue, &mut &s_content).unwrap(); + assert_eq!(file_handle.id.seq, 3); + assert_eq!(file_handle.offset, header_size + s_content.len() as u64); + + let content_readed = pipe_log + .read_bytes(FileBlockHandle { + id: FileId { queue, seq: 3 }, + offset: header_size, + len: s_content.len(), + }) + .unwrap(); + assert_eq!(content_readed, s_content); + // try to fetch abnormal entry + let abnormal_content_readed = pipe_log.read_bytes(FileBlockHandle { + id: FileId { queue, seq: 12 }, // abnormal seq + offset: header_size, + len: s_content.len(), + }); + assert!(abnormal_content_readed.is_err()); + + // leave only 1 file to truncate + pipe_log.purge_to(FileId { queue, seq: 3 }).unwrap(); + assert_eq!(pipe_log.file_span(queue), (3, 3)); + } + + #[test] + fn test_pipe_log_with_recycle() { + let dir = Builder::new() + .prefix("test_pipe_log_with_recycle") + .tempdir() + .unwrap(); + let path = dir.path().to_str().unwrap(); + let cfg = Config { + dir: path.to_owned(), + target_file_size: ReadableSize(1), + // super large capacity for recycling + purge_threshold: ReadableSize::mb(100), + enable_log_recycle: true, + format_version: Version::V2, + ..Default::default() + }; + let queue = LogQueue::Append; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let pipe_log = + new_test_pipe(&cfg, vec![Path::new(&cfg.dir).to_path_buf()], queue, fs).unwrap(); + assert_eq!(pipe_log.file_span(), (1, 1)); + + fn content(i: usize) -> Vec { + vec![(i % (u8::MAX as usize)) as u8; 16] + } + let mut handles = Vec::new(); + for i in 0..10 { + handles.push(pipe_log.append(&mut &content(i)).unwrap()); + pipe_log.sync().unwrap(); + } + pipe_log.rotate().unwrap(); + let (first, last) = pipe_log.file_span(); + // Cannot purge already expired logs or not existsed logs. + assert!(pipe_log.purge_to(first - 1).is_err()); + assert!(pipe_log.purge_to(last + 1).is_err()); + // Retire files. + assert_eq!(pipe_log.purge_to(last).unwrap() as u64, last - first); + // Try to read recycled file. + for handle in handles.into_iter() { + assert!(pipe_log.read_bytes(handle).is_err()); + } + // Try to reuse. + let mut handles = Vec::new(); + for i in 0..10 { + handles.push(pipe_log.append(&mut &content(i + 1)).unwrap()); + pipe_log.sync().unwrap(); + } + // Verify the data. 
+ for (i, handle) in handles.into_iter().enumerate() { + assert_eq!(pipe_log.read_bytes(handle).unwrap(), content(i + 1)); + } + } +} diff --git a/third/raft-engine/src/file_pipe_log/pipe_builder.rs b/third/raft-engine/src/file_pipe_log/pipe_builder.rs new file mode 100644 index 00000000..0608cd8d --- /dev/null +++ b/third/raft-engine/src/file_pipe_log/pipe_builder.rs @@ -0,0 +1,639 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +//! Helper types to recover in-memory states from log files. + +use std::fs::{self, File as StdFile}; +use std::io::Write; +use std::marker::PhantomData; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::Instant; + +use fs2::FileExt; +use log::{error, info, warn}; +use rayon::prelude::*; + +use crate::config::{Config, RecoveryMode}; +use crate::env::{FileSystem, Handle, Permission}; +use crate::errors::is_no_space_err; +use crate::event_listener::EventListener; +use crate::log_batch::{LogItemBatch, LOG_BATCH_HEADER_LEN}; +use crate::pipe_log::{FileId, FileSeq, LogQueue}; +use crate::util::{Factory, ReadableSize}; +use crate::{Error, Result}; + +use super::format::{ + build_reserved_file_name, lock_file_path, parse_reserved_file_name, FileNameExt, LogFileFormat, +}; +use super::log_file::build_file_reader; +use super::pipe::{ + find_available_dir, DualPipes, File, PathId, Paths, SinglePipe, DEFAULT_FIRST_FILE_SEQ, +}; +use super::reader::LogItemBatchFileReader; + +/// Maximum size for the buffer for prefilling. +const PREFILL_BUFFER_SIZE: usize = ReadableSize::mb(16).0 as usize; + +/// `ReplayMachine` is a type of deterministic state machine that obeys +/// associative law. +/// +/// Sequentially arranged log items can be divided and replayed to several +/// [`ReplayMachine`]s, and their merged state will be the same as when +/// replayed to one single [`ReplayMachine`]. +/// +/// This abstraction is useful for recovery in parallel: a set of log files can +/// be replayed in a divide-and-conquer fashion. +pub trait ReplayMachine: Send { + /// Inputs a batch of log items from the given file to this machine. + /// Returns whether the input sequence up till now is accepted. + fn replay(&mut self, item_batch: LogItemBatch, file_id: FileId) -> Result<()>; + + /// Merges with another [`ReplayMachine`] that has consumed newer log items + /// in the same input sequence. + fn merge(&mut self, rhs: Self, queue: LogQueue) -> Result<()>; +} + +/// A factory of [`ReplayMachine`]s that can be default constructed. +#[derive(Clone, Default)] +pub struct DefaultMachineFactory(PhantomData>); + +impl Factory for DefaultMachineFactory { + fn new_target(&self) -> M { + M::default() + } +} + +/// Container for basic settings on recovery. +pub struct RecoveryConfig { + pub queue: LogQueue, + pub mode: RecoveryMode, + pub concurrency: usize, + pub read_block_size: u64, +} + +/// [`DualPipes`] factory that can also recover other customized memory states. +pub struct DualPipesBuilder { + cfg: Config, + file_system: Arc, + listeners: Vec>, + + /// Only filled after a successful call of `DualPipesBuilder::scan`. + dirs: Paths, + dir_locks: Vec, + + pub(crate) append_file_names: Vec, + pub(crate) rewrite_file_names: Vec, + pub(crate) recycled_file_names: Vec, + + append_files: Vec>, + rewrite_files: Vec>, + recycled_files: Vec>, +} + +impl DualPipesBuilder { + /// Creates a new builder. 
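+    ///
+    /// A hedged sketch of the intended call sequence (`cfg`, `fs`, `listeners`
+    /// and a concrete [`ReplayMachine`] type `M` are assumed to be supplied by
+    /// the caller):
+    ///
+    /// ```ignore
+    /// let mut builder = DualPipesBuilder::new(cfg, fs, listeners);
+    /// builder.scan()?;
+    /// let (append, rewrite) = builder.recover(&DefaultMachineFactory::<M>::default())?;
+    /// let pipes = builder.finish()?;
+    /// ```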
+ pub fn new(cfg: Config, file_system: Arc, listeners: Vec>) -> Self { + Self { + cfg, + file_system, + listeners, + dirs: Vec::new(), + dir_locks: Vec::new(), + append_file_names: Vec::new(), + rewrite_file_names: Vec::new(), + recycled_file_names: Vec::new(), + append_files: Vec::new(), + rewrite_files: Vec::new(), + recycled_files: Vec::new(), + } + } + + /// Scans for all log files under the working directory. The directory will + /// be created if not exists. + pub fn scan(&mut self) -> Result<()> { + self.scan_and_sort(true)?; + + // Open all files with suitable permissions. + self.append_files = Vec::with_capacity(self.append_file_names.len()); + for (i, file_name) in self.append_file_names.iter().enumerate() { + let perm = if i == self.append_file_names.len() - 1 + || self.cfg.recovery_mode == RecoveryMode::TolerateAnyCorruption + { + Permission::ReadWrite + } else { + Permission::ReadOnly + }; + self.append_files.push(File { + seq: file_name.seq, + handle: Arc::new(self.file_system.open(&file_name.path, perm)?), + format: LogFileFormat::default(), + path_id: file_name.path_id, + reserved: false, + }); + } + self.rewrite_files = Vec::with_capacity(self.rewrite_file_names.len()); + for (i, file_name) in self.rewrite_file_names.iter().enumerate() { + let perm = if i == self.rewrite_file_names.len() - 1 + || self.cfg.recovery_mode == RecoveryMode::TolerateAnyCorruption + { + Permission::ReadWrite + } else { + Permission::ReadOnly + }; + self.rewrite_files.push(File { + seq: file_name.seq, + handle: Arc::new(self.file_system.open(&file_name.path, perm)?), + format: LogFileFormat::default(), + path_id: file_name.path_id, + reserved: false, + }); + } + self.recycled_files = Vec::with_capacity(self.recycled_file_names.len()); + for file_name in &self.recycled_file_names { + self.recycled_files.push(File { + seq: file_name.seq, + handle: Arc::new( + self.file_system + .open(&file_name.path, Permission::ReadOnly)?, + ), + format: LogFileFormat::default(), + path_id: file_name.path_id, + reserved: true, + }); + } + + // Validate and clear obsolete metadata and log files. + for (queue, files, is_recycled_file) in [ + (LogQueue::Append, &mut self.append_files, false), + (LogQueue::Rewrite, &mut self.rewrite_files, false), + (LogQueue::Append, &mut self.recycled_files, true), + ] { + // Check the file_list and remove the hole of files. + let mut invalid_idx = 0_usize; + for (i, file_pair) in files.windows(2).enumerate() { + // If there exists a black hole or duplicate scenario on FileSeq, these + // files should be skipped and cleared. + if file_pair[1].seq - file_pair[0].seq != 1 { + invalid_idx = i + 1; + } + } + files.drain(..invalid_idx); + // Try to cleanup stale metadata left by the previous version. + if files.is_empty() || is_recycled_file { + continue; + } + let max_sample = 100; + // Find the first obsolete metadata. + let mut delete_start = None; + for i in 0..max_sample { + let seq = i * files[0].seq / max_sample; + let file_id = FileId { queue, seq }; + for dir in self.dirs.iter() { + if self + .file_system + .exists_metadata(file_id.build_file_path(dir)) + { + delete_start = Some(i.saturating_sub(1) * files[0].seq / max_sample + 1); + break; + } + } + if delete_start.is_some() { + break; + } + } + // Delete metadata starting from the oldest. Abort on error. 
+ let mut cleared = 0_u64; + if let Some(clear_start) = delete_start { + for seq in (clear_start..files[0].seq).rev() { + let file_id = FileId { queue, seq }; + for dir in self.dirs.iter() { + let path = if is_recycled_file { + dir.join(build_reserved_file_name(seq)) + } else { + file_id.build_file_path(dir) + }; + if self.file_system.exists_metadata(&path) { + if let Err(e) = self.file_system.delete_metadata(&path) { + error!("failed to delete metadata of {}: {e}.", path.display()); + break; + } + cleared += 1; + } + } + } + } + if cleared > 0 { + warn!( + "clear {cleared} stale metadata of {queue:?} in range [0, {}).", + files[0].seq, + ); + } + } + Ok(()) + } + + pub(crate) fn scan_and_sort(&mut self, lock: bool) -> Result<()> { + let dir = self.cfg.dir.clone(); + self.scan_dir(&dir, lock)?; + + if let Some(dir) = self.cfg.spill_dir.clone() { + self.scan_dir(&dir, lock)?; + } + + self.append_file_names.sort_by(|a, b| a.seq.cmp(&b.seq)); + self.rewrite_file_names.sort_by(|a, b| a.seq.cmp(&b.seq)); + self.recycled_file_names.sort_by(|a, b| a.seq.cmp(&b.seq)); + Ok(()) + } + + fn scan_dir(&mut self, dir: &str, lock: bool) -> Result<()> { + let dir = Path::new(dir); + if !dir.exists() { + if lock { + info!("Create raft log directory: {}", dir.display()); + fs::create_dir(dir)?; + self.dir_locks.push(lock_dir(dir)?); + } + self.dirs.push(dir.to_path_buf()); + return Ok(()); + } + if !dir.is_dir() { + return Err(box_err!("Not directory: {}", dir.display())); + } + if lock { + self.dir_locks.push(lock_dir(dir)?); + } + self.dirs.push(dir.to_path_buf()); + let path_id = self.dirs.len() - 1; + + fs::read_dir(dir)?.try_for_each(|e| -> Result<()> { + let dir_entry = e?; + let p = dir_entry.path(); + if !p.is_file() { + return Ok(()); + } + let file_name = p.file_name().unwrap().to_str().unwrap(); + match FileId::parse_file_name(file_name) { + Some(FileId { + queue: LogQueue::Append, + seq, + }) => self.append_file_names.push(FileName { + seq, + path: p, + path_id, + }), + Some(FileId { + queue: LogQueue::Rewrite, + seq, + }) => self.rewrite_file_names.push(FileName { + seq, + path: p, + path_id, + }), + _ => { + if let Some(seq) = parse_reserved_file_name(file_name) { + self.recycled_file_names.push(FileName { + seq, + path: p, + path_id, + }) + } + } + } + Ok(()) + }) + } + + /// Reads through log items in all available log files, and replays them to + /// specific [`ReplayMachine`]s that can be constructed via + /// `machine_factory`. + pub fn recover>( + &mut self, + machine_factory: &FA, + ) -> Result<(M, M)> { + if self.append_files.is_empty() && self.rewrite_files.is_empty() { + // Avoid creating a thread pool. 
+ return Ok((machine_factory.new_target(), machine_factory.new_target())); + } + info!( + "recover file pipe log, found {} append files, {} rewrite files", + self.append_files.len(), + self.rewrite_files.len() + ); + let threads = std::cmp::min( + self.cfg.recovery_threads, + self.append_files.len() + self.rewrite_files.len(), + ); + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(threads) + .build() + .unwrap(); + let (append_concurrency, rewrite_concurrency) = + match (self.append_files.len(), self.rewrite_files.len()) { + (a, b) if a > 0 && b > 0 => { + let a_threads = std::cmp::max(1, threads * a / (a + b)); + let b_threads = std::cmp::max(1, threads.saturating_sub(a_threads)); + (a_threads, b_threads) + } + _ => (threads, threads), + }; + let append_recovery_cfg = RecoveryConfig { + queue: LogQueue::Append, + mode: self.cfg.recovery_mode, + concurrency: append_concurrency, + read_block_size: self.cfg.recovery_read_block_size.0, + }; + let rewrite_recovery_cfg = RecoveryConfig { + queue: LogQueue::Rewrite, + concurrency: rewrite_concurrency, + ..append_recovery_cfg + }; + + let append_files = &mut self.append_files; + let rewrite_files = &mut self.rewrite_files; + + let file_system = self.file_system.clone(); + // As the `recover_queue` would update the `LogFileFormat` of each log file + // in `apend_files` and `rewrite_files`, we re-design the implementation on + // `recover_queue` to make it compatiable to concurrent processing + // with ThreadPool. + let (append, rewrite) = pool.join( + || { + DualPipesBuilder::recover_queue_imp( + file_system.clone(), + append_recovery_cfg, + append_files, + machine_factory, + ) + }, + || { + DualPipesBuilder::recover_queue_imp( + file_system.clone(), + rewrite_recovery_cfg, + rewrite_files, + machine_factory, + ) + }, + ); + Ok((append?, rewrite?)) + } + + /// Manually reads through log items in all available log files of the + /// specified queue, and replays them to specific [`ReplayMachine`]s + /// that can be constructed via `machine_factory`. + fn recover_queue_imp>( + file_system: Arc, + recovery_cfg: RecoveryConfig, + files: &mut Vec>, + machine_factory: &FA, + ) -> Result { + if recovery_cfg.concurrency == 0 || files.is_empty() { + return Ok(machine_factory.new_target()); + } + let queue = recovery_cfg.queue; + let concurrency = recovery_cfg.concurrency; + let recovery_mode = recovery_cfg.mode; + let recovery_read_block_size = recovery_cfg.read_block_size as usize; + + let max_chunk_size = std::cmp::max((files.len() + concurrency - 1) / concurrency, 1); + let chunks = files.par_chunks_mut(max_chunk_size); + let chunk_count = chunks.len(); + debug_assert!(chunk_count <= concurrency); + let machine = chunks + .enumerate() + .map(|(index, chunk)| { + let mut reader = + LogItemBatchFileReader::new(recovery_read_block_size); + let mut machine = machine_factory.new_target(); + let file_count = chunk.len(); + for (i, f) in chunk.iter_mut().enumerate() { + let is_last_file = index == chunk_count - 1 && i == file_count - 1; + let file_reader = build_file_reader(file_system.as_ref(), f.handle.clone())?; + match reader.open(FileId { queue, seq: f.seq }, file_reader) { + Err(e) if matches!(e, Error::Io(_)) => return Err(e), + Err(e) => { + // TODO: More reliable tail detection. 
+ if recovery_mode == RecoveryMode::TolerateAnyCorruption + || recovery_mode == RecoveryMode::TolerateTailCorruption + && is_last_file { + warn!( + "Truncating log file due to broken header (queue={:?},seq={}): {}", + queue, f.seq, e + ); + f.handle.truncate(0)?; + f.format = LogFileFormat::default(); + continue; + } else { + error!( + "Failed to open log file due to broken header (queue={:?},seq={}): {}", + queue, f.seq, e + ); + return Err(e); + } + }, + Ok(format) => { + f.format = format; + } + } + let mut pending_item = None; + loop { + match pending_item.unwrap_or_else(|| reader.next()) { + Ok(Some(item_batch)) => { + let next_item = reader.next(); + // This is the last item. Check entries block. + if_chain::if_chain! { + if matches!(next_item, Err(_) | Ok(None)); + if let Some(ei) = item_batch.entry_index(); + let handle = ei.entries.unwrap(); + if let Err(e) = crate::LogBatch::decode_entries_block( + &reader.reader.as_mut().unwrap().read(handle)?, + handle, + ei.compression_type, + ); + then { + let offset = handle.offset as usize - LOG_BATCH_HEADER_LEN; + if recovery_mode == RecoveryMode::AbsoluteConsistency { + error!( + "Failed to open log file due to broken entry (queue={:?},seq={},offset={}): {}", + queue, f.seq, offset, e + ); + return Err(e); + } else { + warn!( + "Truncating log file due to broken entries block (queue={:?},seq={},offset={}): {}", + queue, f.seq, offset, e + ); + f.handle.truncate(offset)?; + f.handle.sync()?; + break; + } + } + } + pending_item = Some(next_item); + machine.replay(item_batch, FileId { queue, seq: f.seq })?; + } + Ok(None) => break, + Err(e) + if recovery_mode == RecoveryMode::TolerateTailCorruption + && is_last_file || recovery_mode == RecoveryMode::TolerateAnyCorruption => + { + warn!( + "Truncating log file due to broken batch (queue={:?},seq={},offset={}): {}", + queue, f.seq, reader.valid_offset(), e + ); + f.handle.truncate(reader.valid_offset())?; + f.handle.sync()?; + break; + } + Err(e) => { + error!( + "Failed to open log file due to broken batch (queue={:?},seq={},offset={}): {}", + queue, f.seq, reader.valid_offset(), e + ); + return Err(e); + } + } + } + } + Ok(machine) + }) + .try_reduce( + || machine_factory.new_target(), + |mut lhs, rhs| { + lhs.merge(rhs, queue)?; + Ok(lhs) + }, + )?; + + Ok(machine) + } + + /// Manually reads through log items in all available log files of the + /// specified `[LogQueue]`, and replays them to specific + /// [`ReplayMachine`]s that can be constructed via `machine_factory`. 
+ #[allow(dead_code)] + pub fn recover_queue>( + &mut self, + file_system: Arc, + recovery_cfg: RecoveryConfig, + replay_machine_factory: &FA, + ) -> Result { + let files = if recovery_cfg.queue == LogQueue::Append { + &mut self.append_files + } else { + &mut self.rewrite_files + }; + DualPipesBuilder::recover_queue_imp( + file_system, + recovery_cfg, + files, + replay_machine_factory, + ) + } + + fn initialize_files(&mut self) -> Result<()> { + let target_file_size = self.cfg.target_file_size.0 as usize; + let mut target = std::cmp::min( + self.cfg.prefill_capacity(), + self.cfg + .recycle_capacity() + .saturating_sub(self.append_files.len()), + ); + let to_create = target.saturating_sub(self.recycled_files.len()); + if to_create > 0 { + let now = Instant::now(); + for _ in 0..to_create { + let seq = self + .recycled_files + .last() + .map(|f| f.seq + 1) + .unwrap_or_else(|| DEFAULT_FIRST_FILE_SEQ); + let path_id = find_available_dir(&self.dirs, target_file_size); + let root_path = &self.dirs[path_id]; + let path = root_path.join(build_reserved_file_name(seq)); + let handle = Arc::new(self.file_system.create(path)?); + let mut writer = self.file_system.new_writer(handle.clone())?; + let mut written = 0; + let buf = vec![0; std::cmp::min(PREFILL_BUFFER_SIZE, target_file_size)]; + while written < target_file_size { + if let Err(e) = writer.write_all(&buf) { + warn!("failed to build reserved file, err: {e}"); + if is_no_space_err(&e) { + warn!("no enough space for preparing reserved logs"); + // Clear partially prepared recycled log list if there has no enough + // space for it. + target = 0; + } + break; + } + written += buf.len(); + } + self.recycled_files.push(File { + seq, + handle, + format: LogFileFormat::default(), + path_id, + reserved: true, + }); + } + info!( + "prefill logs takes {:?}, created {to_create} files", + now.elapsed(), + ); + } + // If target recycled capacity has been changed when restarting by manually + // modifications, such as setting `Config::enable-log-recycle` from TRUE to + // FALSE, setting `Config::prefill-for-recycle` from TRUE to FALSE or + // changing the recycle capacity, we should remove redundant + // recycled files in advance. + while self.recycled_files.len() > target { + let f = self.recycled_files.pop().unwrap(); + let root_path = &self.dirs[f.path_id]; + let path = root_path.join(build_reserved_file_name(f.seq)); + let _ = self.file_system.delete(path); + } + Ok(()) + } + + /// Builds a [`DualPipes`] that contains all available log files. + pub fn finish(mut self) -> Result> { + self.initialize_files()?; + let appender = SinglePipe::open( + &self.cfg, + self.dirs.clone(), + self.file_system.clone(), + self.listeners.clone(), + LogQueue::Append, + self.append_files, + self.recycled_files, + )?; + let rewriter = SinglePipe::open( + &self.cfg, + self.dirs, + self.file_system.clone(), + self.listeners.clone(), + LogQueue::Rewrite, + self.rewrite_files, + Vec::new(), + )?; + DualPipes::open(self.dir_locks, appender, rewriter) + } +} + +/// Creates and exclusively locks a lock file under the given directory. 
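+/// The returned handle holds the lock until it is dropped; [`DualPipes`] keeps
+/// it alive for exactly that purpose.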
+pub(super) fn lock_dir>(dir: P) -> Result { + let lock_file = StdFile::create(lock_file_path(dir))?; + lock_file.try_lock_exclusive().map_err(|e| { + Error::Other(box_err!( + "Failed to lock file: {}, maybe another instance is using this directory.", + e + )) + })?; + Ok(lock_file) +} + +pub(crate) struct FileName { + pub seq: FileSeq, + pub path: PathBuf, + path_id: PathId, +} diff --git a/third/raft-engine/src/file_pipe_log/reader.rs b/third/raft-engine/src/file_pipe_log/reader.rs new file mode 100644 index 00000000..106ba72f --- /dev/null +++ b/third/raft-engine/src/file_pipe_log/reader.rs @@ -0,0 +1,185 @@ +// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. + +use crate::env::FileSystem; +use crate::log_batch::{LogBatch, LogItemBatch, LOG_BATCH_HEADER_LEN}; +use crate::pipe_log::{FileBlockHandle, FileId, LogFileContext}; +use crate::util::round_up; +use crate::{Error, Result}; + +use super::format::{is_zero_padded, LogFileFormat}; +use super::log_file::LogFileReader; + +/// A reusable reader over [`LogItemBatch`]s in a log file. +pub(super) struct LogItemBatchFileReader { + file_id: Option, + format: Option, + pub(crate) reader: Option>, + size: usize, + + buffer: Vec, + /// File offset of the data contained in `buffer`. + buffer_offset: usize, + /// File offset of the end of last decoded log batch. + valid_offset: usize, + + /// The maximum number of bytes to prefetch. + read_block_size: usize, +} + +impl LogItemBatchFileReader { + /// Creates a new reader. + pub fn new(read_block_size: usize) -> Self { + Self { + file_id: None, + format: None, + reader: None, + size: 0, + + buffer: Vec::new(), + buffer_offset: 0, + valid_offset: 0, + + read_block_size, + } + } + + /// Opens a file that can be accessed through the given reader. + pub fn open(&mut self, file_id: FileId, mut reader: LogFileReader) -> Result { + let format = reader.parse_format()?; + self.valid_offset = LogFileFormat::encoded_len(format.version); + self.file_id = Some(file_id); + self.format = Some(format); + self.size = reader.file_size()?; + self.reader = Some(reader); + self.buffer.clear(); + self.buffer_offset = 0; + Ok(format) + } + + /// Closes any ongoing file access. + pub fn reset(&mut self) { + self.file_id = None; + self.format = None; + self.reader = None; + self.size = 0; + self.buffer.clear(); + self.buffer_offset = 0; + self.valid_offset = 0; + } + + /// Returns the next [`LogItemBatch`] in current opened file. Returns + /// `None` if there is no more data or no opened file. + pub fn next(&mut self) -> Result> { + // TODO: [Fulfilled in writing progress when DIO is open.] + // We should also consider that there might exists broken blocks when DIO + // is open, and the following reading strategy should tolerate reading broken + // blocks until it finds an accessible header of `LogBatch`. + while self.valid_offset < self.size { + let format = self.format.unwrap(); + if self.valid_offset < LOG_BATCH_HEADER_LEN { + return Err(Error::Corruption( + "attempt to read file with broken header".to_owned(), + )); + } + let r = LogBatch::decode_header(&mut self.peek( + self.valid_offset, + LOG_BATCH_HEADER_LEN, + 0, + )?); + if_chain::if_chain! 
{ + if r.is_err(); + if format.alignment > 0; + let aligned_next_offset = round_up(self.valid_offset, format.alignment as usize); + if self.valid_offset != aligned_next_offset; + if is_zero_padded(self.peek(self.valid_offset, aligned_next_offset - self.valid_offset, 0)?); + then { + // In DataLayout::Alignment mode, tail data in the previous block + // may be aligned with paddings, that is '0'. So, we need to + // skip these redundant content and get the next valid header + // of `LogBatch`. + self.valid_offset = aligned_next_offset; + continue; + } + // If we continued with aligned offset and get a parsed err, + // it means that the header is broken or the padding is filled + // with non-zero bytes, and the err will be returned. + } + let (footer_offset, compression_type, len) = r?; + if self.valid_offset + len > self.size { + return Err(Error::Corruption("log batch header broken".to_owned())); + } + let handle = FileBlockHandle { + id: self.file_id.unwrap(), + offset: (self.valid_offset + LOG_BATCH_HEADER_LEN) as u64, + len: footer_offset - LOG_BATCH_HEADER_LEN, + }; + let context = LogFileContext { + id: self.file_id.unwrap(), + version: format.version, + }; + let item_batch = LogItemBatch::decode( + &mut self.peek( + self.valid_offset + footer_offset, + len - footer_offset, + LOG_BATCH_HEADER_LEN, + )?, + handle, + compression_type, + &context, + )?; + self.valid_offset += len; + return Ok(Some(item_batch)); + } + Ok(None) + } + + /// Reads some bytes starting at `offset`. Pulls bytes from the file into + /// its internal buffer if necessary, and attempts to prefetch in that + /// process. + /// + /// Returns a slice of internal buffer with specified size. + fn peek(&mut self, offset: usize, size: usize, prefetch: usize) -> Result<&[u8]> { + debug_assert!(offset >= self.buffer_offset); + let reader = self.reader.as_mut().unwrap(); + let end = self.buffer_offset + self.buffer.len(); + if offset > end { + self.buffer_offset = offset; + self.buffer + .resize(std::cmp::max(size + prefetch, self.read_block_size), 0); + let read = reader.read_to(self.buffer_offset as u64, &mut self.buffer)?; + if read < size { + return Err(Error::Corruption(format!( + "Unexpected eof at {}", + self.buffer_offset + read + ))); + } + self.buffer.truncate(read); + Ok(&self.buffer[..size]) + } else { + let should_read = (offset + size + prefetch).saturating_sub(end); + if should_read > 0 { + let read_offset = self.buffer_offset + self.buffer.len(); + let prev_len = self.buffer.len(); + self.buffer.resize( + prev_len + std::cmp::max(should_read, self.read_block_size), + 0, + ); + let read = reader.read_to(read_offset as u64, &mut self.buffer[prev_len..])?; + if read + prefetch < should_read { + return Err(Error::Corruption(format!( + "Unexpected eof at {}", + read_offset + read, + ))); + } + self.buffer.truncate(prev_len + read); + } + Ok(&self.buffer[offset - self.buffer_offset..offset - self.buffer_offset + size]) + } + } + + /// Returns the offset to the end of verified and decoded data in current + /// file. Returns zero if there is no file opened. + pub fn valid_offset(&self) -> usize { + self.valid_offset + } +} diff --git a/third/raft-engine/src/filter.rs b/third/raft-engine/src/filter.rs new file mode 100644 index 00000000..f992d788 --- /dev/null +++ b/third/raft-engine/src/filter.rs @@ -0,0 +1,436 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
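+//! Script-driven filtering of existing log files.
+//!
+//! `RhaiFilterMachine` below is a `ReplayMachine` that buffers the items of
+//! every scanned file, asks a user-supplied Rhai script (`RhaiFilter`) for a
+//! verdict on each item, and finally rewrites the affected files in place in
+//! `finish`, keeping a `.bak` copy of each file until the rewrite succeeds.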
+ +use std::path::Path; +use std::sync::Arc; + +use hashbrown::HashMap; +use rhai::{Engine, Scope, AST}; +use scopeguard::{guard, ScopeGuard}; + +use crate::env::FileSystem; +use crate::file_pipe_log::debug::{build_file_reader, build_file_writer}; +use crate::file_pipe_log::{FileNameExt, ReplayMachine}; +use crate::log_batch::{ + Command, EntryIndexes, KeyValue, LogBatch, LogItem, LogItemBatch, LogItemContent, OpType, +}; +use crate::pipe_log::{FileId, LogFileContext, LogQueue}; +use crate::util::Factory; +use crate::{Error, Result}; + +/// `FilterResult` determines how to alter the existing log items in +/// `RhaiFilterMachine`. +#[derive(PartialEq, Eq)] +enum FilterResult { + /// Apply in the usual way. + Default, + /// Ignore all incoming entries or operations. + DiscardIncoming, + /// Delete all existing entries. + DiscardExisting, +} + +impl FilterResult { + fn from_i64(i: i64) -> Self { + match i { + 0 => FilterResult::Default, + 1 => FilterResult::DiscardIncoming, + 2 => FilterResult::DiscardExisting, + _ => unreachable!(), + } + } +} + +/// `RaftGroupState` represents a simplistic view of a Raft Group. +#[derive(Copy, Clone)] +struct RaftGroupState { + pub first_index: u64, + pub count: usize, + pub rewrite_count: usize, +} + +impl RaftGroupState { + /// Removes all data in this Raft Group. + pub fn clear(&mut self) { + self.first_index = 0; + self.count = 0; + self.rewrite_count = 0; + } + + /// Applies `item` from `queue` into this Raft Group state. + pub fn apply(&mut self, queue: LogQueue, item: &LogItemContent) -> Result<()> { + match item { + LogItemContent::EntryIndexes(EntryIndexes(eis)) => { + if let (Some(first), Some(last)) = (eis.first(), eis.last()) { + let first = first.index; + let last = last.index; + if self.count > 0 { + // hole + if first > self.first_index + self.count as u64 { + if queue == LogQueue::Append { + return Err(Error::Corruption("Encountered hole".to_owned())); + } else { + self.clear(); + } + } + // compacted + if first < self.first_index { + if queue == LogQueue::Append { + return Err(Error::Corruption("Write to compacted".to_owned())); + } else { + self.clear(); + } + } + // non-contiguous rewrites + if queue == LogQueue::Rewrite + && self.first_index + (self.rewrite_count as u64) < first + { + return Err(Error::Corruption( + "Rewrites are not contiguous".to_owned(), + )); + } + } + if self.count == 0 { + // empty + self.first_index = first; + self.count = (last - first + 1) as usize; + self.rewrite_count = if queue == LogQueue::Rewrite { + self.count + } else { + 0 + }; + } else { + self.count = (last - self.first_index + 1) as usize; + if queue == LogQueue::Rewrite { + self.rewrite_count = self.count; + } else { + self.rewrite_count = (first - self.first_index) as usize; + } + } + } + } + LogItemContent::Command(Command::Compact { index }) + if *index > self.first_index && self.count > 0 => + { + if *index < self.first_index + self.count as u64 - 1 { + let deleted = *index - self.first_index; + self.first_index = *index; + self.rewrite_count = self.rewrite_count.saturating_sub(deleted as usize); + } else { + self.clear(); + } + } + LogItemContent::Command(Command::Clean) => self.clear(), + _ => {} + } + Ok(()) + } +} + +/// `RhaiFilter` is a stateless machine that filters incoming log items. Its +/// filtering logic is implemented in Rhai script. 
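+///
+/// Each callback returns an integer that maps onto `FilterResult` above:
+/// `0` applies the item as usual, `1` discards the incoming item, and `2`
+/// drops all existing entries of the Raft Group first. The `queue` argument is
+/// the numeric value of `LogQueue` (`0` for append, `1` for rewrite), and a
+/// missing callback is treated as returning `0`.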
+/// Sample script: +/// ```rhai +/// fn filter_append(id, first, count, rewrite_count, queue, ifirst, ilast) { +/// if ifirst < first { +/// return 1; // discard incoming +/// } +/// 0 // default +/// } +/// +/// fn filter_compact(id, first, count, rewrite_count, queue, compact_to) { +/// 0 // default +/// } +/// +/// fn filter_clean(id, first, count, rewrite_count, queue) { +/// if queue == 1 { // rewrite queue +/// return 1; // discard incoming +/// } +/// 0 // default +/// } +/// ``` +struct RhaiFilter { + engine: Arc, + ast: Arc, + scope: Scope<'static>, +} + +impl RhaiFilter { + /// Filters `new_item_content` from `new_item_queue` intended to be applied + /// to the Raft Group. + pub fn filter( + &mut self, + raft_group_id: u64, + state: RaftGroupState, + new_item_queue: LogQueue, + new_item_content: &LogItemContent, + ) -> Result { + let res = match new_item_content { + LogItemContent::EntryIndexes(EntryIndexes(eis)) if !eis.is_empty() => { + self.engine.call_fn( + &mut self.scope, + &self.ast, + "filter_append", + ( + raft_group_id as i64, + state.first_index as i64, + state.count as i64, + state.rewrite_count as i64, + new_item_queue as i64, + eis.first().unwrap().index as i64, + eis.last().unwrap().index as i64, + ), + ) + } + LogItemContent::Command(Command::Compact { index }) => self.engine.call_fn( + &mut self.scope, + &self.ast, + "filter_compact", + ( + raft_group_id as i64, + state.first_index as i64, + state.count as i64, + state.rewrite_count as i64, + new_item_queue as i64, + *index as i64, + ), + ), + LogItemContent::Command(Command::Clean) => self.engine.call_fn( + &mut self.scope, + &self.ast, + "filter_clean", + ( + raft_group_id as i64, + state.first_index as i64, + state.count as i64, + state.rewrite_count as i64, + new_item_queue as i64, + ), + ), + _ => Ok(0), + }; + match res { + Ok(n) => Ok(FilterResult::from_i64(n)), + Err(e) => { + if matches!(*e, rhai::EvalAltResult::ErrorFunctionNotFound(_, _)) { + Ok(FilterResult::Default) + } else { + Err(Error::Corruption(e.to_string())) + } + } + } + } +} + +struct FileAndItems { + file_id: FileId, + items: Vec, + filtered: bool, +} + +/// `RhaiFilterMachine` is a `ReplayMachine` that filters existing log files +/// based on external Rhai script. +pub struct RhaiFilterMachine { + filter: RhaiFilter, + files: Vec, + states: HashMap, +} + +impl RhaiFilterMachine { + fn new(filter: RhaiFilter) -> Self { + Self { + filter, + files: Vec::new(), + states: HashMap::new(), + } + } + + /// Writes out filtered log items and replaces existing log files. Always + /// attempt to recover original log files on error. Panics if that recovery + /// fails. + pub fn finish(self, system: &F, path: &Path) -> Result<()> { + let mut log_batch = LogBatch::default(); + let mut guards = Vec::new(); + for f in self.files.into_iter() { + if f.filtered { + // Backup file and set up a guard to recover on exit. 
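+                // `guard` runs its closure on drop, restoring the `.bak` copy
+                // if this function bails out early; once the rewrite succeeds
+                // the guard is defused with `ScopeGuard::into_inner` at the end
+                // of `finish` and the backup is deleted instead.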
+ let target_path = f.file_id.build_file_path(path); + let bak_path = target_path.with_extension("bak"); + system.rename(&target_path, &bak_path)?; + guards.push(( + bak_path.clone(), + guard(f.file_id, |f| { + let original = f.build_file_path(path); + let bak = original.with_extension("bak"); + if bak.exists() { + system.rename(&bak, &original).unwrap_or_else(|e| { + panic!( + "Failed to recover original log file {} ({e}), + you should manually replace it with {}.bak.", + f.build_file_name(), + f.build_file_name(), + ) + }); + } + }), + )); + let mut reader = build_file_reader(system, &bak_path)?; + let format = reader.parse_format()?; + let mut writer = + build_file_writer(system, &target_path, format, true /* create */)?; + let log_file_context = LogFileContext::new(f.file_id, format.version); + // Write out new log file. + for item in f.items.into_iter() { + match item.content { + LogItemContent::EntryIndexes(EntryIndexes(eis)) => { + let mut entries = Vec::with_capacity(eis.len()); + for ei in &eis { + let entries_buf = reader.read(ei.entries.unwrap())?; + let block = LogBatch::decode_entries_block( + &entries_buf, + ei.entries.unwrap(), + ei.compression_type, + )?; + entries.push( + block[ei.entry_offset as usize + ..(ei.entry_offset + ei.entry_len) as usize] + .to_owned(), + ); + } + log_batch.add_raw_entries(item.raft_group_id, eis, entries)?; + } + LogItemContent::Command(cmd) => { + log_batch.add_command(item.raft_group_id, cmd); + } + LogItemContent::Kv(KeyValue { + op_type, + key, + value, + .. + }) => match op_type { + OpType::Put => { + log_batch.put(item.raft_group_id, key, value.unwrap())? + } + OpType::Del => log_batch.delete(item.raft_group_id, key), + }, + } + // Batch 64KB. + if log_batch.approximate_size() >= 64 * 1024 { + log_batch.finish_populate(0 /* compression_threshold */, None)?; + log_batch.prepare_write(&log_file_context)?; + writer.write( + log_batch.encoded_bytes(), + usize::MAX, /* target_size_hint */ + )?; + log_batch.drain(); + } + } + if !log_batch.is_empty() { + log_batch.finish_populate(0 /* compression_threshold */, None)?; + log_batch.prepare_write(&log_file_context)?; + writer.write( + log_batch.encoded_bytes(), + usize::MAX, /* target_size_hint */ + )?; + log_batch.drain(); + } + writer.close()?; + } + } + // Delete backup file and defuse the guard. + for (bak, guard) in guards.into_iter() { + let _ = std::fs::remove_file(bak); + let _ = ScopeGuard::into_inner(guard); + } + Ok(()) + } + + /// Assumes the last element in `self.files` is `file_id`. 
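+    ///
+    /// The item is first passed to the Rhai filter: depending on the verdict it
+    /// is dropped (`DiscardIncoming`), kept but preceded by a
+    /// `Compact { index: u64::MAX }` command that wipes existing entries
+    /// (`DiscardExisting`), or kept as-is; the per-group `RaftGroupState` is
+    /// then updated and the item buffered for `finish`.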
+ fn replay_item(&mut self, item: LogItem, file_id: FileId) -> Result<()> { + let current = self.files.last_mut().unwrap(); + let state = self + .states + .entry(item.raft_group_id) + .or_insert(RaftGroupState { + first_index: 0, + count: 0, + rewrite_count: 0, + }); + let result = + self.filter + .filter(item.raft_group_id, *state, file_id.queue, &item.content)?; + if result == FilterResult::DiscardIncoming { + current.filtered = true; + return Ok(()); + } else if result == FilterResult::DiscardExisting { + current.filtered = true; + state.clear(); + current.items.push(LogItem::new_command( + item.raft_group_id, + Command::Compact { index: u64::MAX }, + )); + } + state.apply(file_id.queue, &item.content)?; + current.items.push(item.clone()); + Ok(()) + } +} + +impl ReplayMachine for RhaiFilterMachine { + fn replay(&mut self, mut item_batch: LogItemBatch, file_id: FileId) -> Result<()> { + if self.files.is_empty() || self.files.last().unwrap().file_id != file_id { + self.files.push(FileAndItems { + file_id, + items: Vec::new(), + filtered: false, + }); + } + for item in item_batch.drain() { + self.replay_item(item, file_id)?; + } + Ok(()) + } + + fn merge(&mut self, rhs: Self, _queue: LogQueue) -> Result<()> { + for f in rhs.files.into_iter() { + if self.files.is_empty() || self.files.last().unwrap().file_id != f.file_id { + self.files.push(FileAndItems { + file_id: f.file_id, + items: Vec::new(), + filtered: f.filtered, + }); + } + for item in f.items.into_iter() { + self.replay_item(item, f.file_id)?; + } + } + Ok(()) + } +} + +pub struct RhaiFilterMachineFactory { + engine: Arc, + ast: Arc, +} + +impl RhaiFilterMachineFactory { + pub fn from_script(script: String) -> Self { + let engine = Engine::new(); + let ast = engine.compile(script).unwrap(); + engine.run_ast_with_scope(&mut Scope::new(), &ast).unwrap(); + Self { + engine: Arc::new(engine), + ast: Arc::new(ast), + } + } +} + +impl Factory for RhaiFilterMachineFactory { + fn new_target(&self) -> RhaiFilterMachine { + let filter = RhaiFilter { + engine: self.engine.clone(), + ast: self.ast.clone(), + scope: Scope::new(), + }; + RhaiFilterMachine::new(filter) + } +} diff --git a/third/raft-engine/src/fork.rs b/third/raft-engine/src/fork.rs new file mode 100644 index 00000000..cab65a92 --- /dev/null +++ b/third/raft-engine/src/fork.rs @@ -0,0 +1,180 @@ +// Copyright (c) 2023-present, PingCAP, Inc. Licensed under Apache-2.0. + +use std::fs::{copy, create_dir_all}; +use std::path::Path; +use std::sync::Arc; + +#[cfg(not(windows))] +use std::os::unix::fs::symlink; +#[cfg(windows)] +use std::os::windows::fs::symlink_file as symlink; + +use crate::config::{Config, RecoveryMode}; +use crate::env::FileSystem; +use crate::file_pipe_log::{FileNameExt, FilePipeLog, FilePipeLogBuilder}; +use crate::pipe_log::{FileId, LogQueue}; +use crate::Engine; + +/// Returned by `Engine::fork`. +#[derive(Default)] +pub struct CopyDetails { + /// Paths of copied log files. + pub copied: Vec, + /// Paths of symlinked log files. + pub symlinked: Vec, +} + +impl Engine> { + /// Make a copy from `source` to `target`. `source` should exists but + /// `target` shouldn't. And `source` shouldn't be opened, otherwise + /// data corruption can happen. + /// + /// *symlink* will be used if possbile, otherwise *copy* will be used + /// instead. Generally all inactive log files will be symlinked, but the + /// last active one will be copied. + /// + /// After the copy is made both of 2 engines can be started and run at the + /// same time. 
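+    ///
+    /// A minimal sketch mirroring the unit test below (`"./forked"` is a
+    /// hypothetical target path; `DefaultFileSystem` is the file system the
+    /// tests use):
+    ///
+    /// ```ignore
+    /// let details = Engine::<_, _>::fork(&cfg, Arc::new(DefaultFileSystem), "./forked")?;
+    /// println!("copied {}, symlinked {}", details.copied.len(), details.symlinked.len());
+    /// ```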
+ /// + /// It reports errors if the source instance + /// * is specified with `enable_log_recycle = true`. `source` and `target` + /// can share log files, so log file reusing can cause data corruption. + /// * is specified with `recovery_mode = TolerateAnyCorruption`, in which + /// case *symlink* can't be use. Users should consider to copy the + /// instance directly. + pub fn fork>( + source: &Config, + fs: Arc, + target: T, + ) -> Result { + minimum_copy(source, fs, target) + } +} + +fn minimum_copy(cfg: &Config, fs: Arc, target: P) -> Result +where + F: FileSystem, + P: AsRef, +{ + if cfg.enable_log_recycle { + return Err("enable_log_recycle should be false".to_owned()); + } + if cfg.recovery_mode == RecoveryMode::TolerateAnyCorruption { + return Err("recovery_mode shouldn't be TolerateAnyCorruption".to_owned()); + } + + let mut cfg = cfg.clone(); + cfg.sanitize() + .map_err(|e| format!("sanitize config: {e}"))?; + + create_dir_all(&target) + .map_err(|e| format!("create_dir_all({}): {e}", target.as_ref().display()))?; + + let mut builder = FilePipeLogBuilder::new(cfg.clone(), fs, vec![]); + builder + .scan_and_sort(false) + .map_err(|e| format!("scan files: {e}"))?; + + // Iterate all log files and rewrite files. + let mut details = CopyDetails::default(); + for (queue, files) in [ + (LogQueue::Append, &builder.append_file_names), + (LogQueue::Rewrite, &builder.rewrite_file_names), + ] { + let count = files.len(); + for (i, f) in files.iter().enumerate() { + let src: &Path = f.path.as_ref(); + let dst = FileId::new(queue, f.seq).build_file_path(&target); + if i < count - 1 { + symlink(src, &dst) + .map_err(|e| format!("symlink({}, {}): {e}", src.display(), dst.display()))?; + let path = dst.canonicalize().unwrap().to_str().unwrap().to_owned(); + details.symlinked.push(path); + } else { + copy(src, &dst) + .map(|_| ()) + .map_err(|e| format!("copy({}, {}): {e}", src.display(), dst.display()))?; + let path = dst.canonicalize().unwrap().to_str().unwrap().to_owned(); + details.copied.push(path); + }; + } + } + + Ok(details) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::engine::tests::RaftLogEngine; + use crate::env::DefaultFileSystem; + use crate::{LogBatch, ReadableSize}; + use std::path::PathBuf; + + #[test] + fn test_fork() { + let dir = tempfile::Builder::new() + .prefix("test_engine_fork") + .tempdir() + .unwrap(); + + let mut source = PathBuf::from(dir.as_ref()); + source.push("source"); + let mut cfg = Config { + dir: source.to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(1), + enable_log_recycle: false, + ..Default::default() + }; + let engine = RaftLogEngine::open(cfg.clone()).unwrap(); + + let mut log_batch = LogBatch::default(); + log_batch.put(1, vec![b'1'; 16], vec![b'v'; 1024]).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + + let mut log_batch = LogBatch::default(); + log_batch.put(1, vec![b'2'; 16], vec![b'v'; 1024]).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + + let mut log_batch = LogBatch::default(); + log_batch.put(1, vec![b'3'; 16], vec![b'v'; 1024]).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + + let mut log_batch = LogBatch::default(); + log_batch.put(1, vec![b'4'; 16], vec![b'v'; 1024]).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + + let mut target = PathBuf::from(dir.as_ref()); + target.push("target"); + Engine::<_, _>::fork(&cfg, 
Arc::new(DefaultFileSystem), &target).unwrap(); + cfg.dir = target.to_str().unwrap().to_owned(); + let engine1 = RaftLogEngine::open(cfg.clone()).unwrap(); + + assert!(engine1.get(1, vec![b'1'; 16].as_ref()).is_some()); + assert!(engine1.get(1, vec![b'2'; 16].as_ref()).is_some()); + assert!(engine1.get(1, vec![b'3'; 16].as_ref()).is_some()); + assert!(engine1.get(1, vec![b'4'; 16].as_ref()).is_some()); + + let mut log_batch = LogBatch::default(); + log_batch.put(1, vec![b'5'; 16], vec![b'v'; 1024]).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + + let mut log_batch = LogBatch::default(); + log_batch.put(1, vec![b'6'; 16], vec![b'v'; 1024]).unwrap(); + engine1.write(&mut log_batch, false).unwrap(); + + assert!(engine.get(1, vec![b'5'; 16].as_ref()).is_some()); + assert!(engine1.get(1, vec![b'6'; 16].as_ref()).is_some()); + + let mut target = PathBuf::from(dir.as_ref()); + target.push("target-1"); + let mut cfg1 = cfg.clone(); + cfg1.enable_log_recycle = true; + assert!(Engine::<_, _>::fork(&cfg1, Arc::new(DefaultFileSystem), &target).is_err()); + let mut cfg1 = cfg; + cfg1.recovery_mode = RecoveryMode::TolerateAnyCorruption; + assert!(Engine::<_, _>::fork(&cfg1, Arc::new(DefaultFileSystem), &target).is_err()); + } +} diff --git a/third/raft-engine/src/lib.rs b/third/raft-engine/src/lib.rs new file mode 100644 index 00000000..bc00a1c4 --- /dev/null +++ b/third/raft-engine/src/lib.rs @@ -0,0 +1,247 @@ +// Copyright (c) 2017-present, PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +//! # Raft Engine + +#![allow(clippy::redundant_closure_call)] +#![allow(unused_imports)] +#![cfg_attr(feature = "nightly", feature(test))] +#![cfg_attr(feature = "swap", feature(allocator_api))] +#![cfg_attr(feature = "swap", feature(slice_ptr_get))] +// Though the new nightly rust stablized this feature, keep it anyway +// because some other project (like TiKV) is still using the old. +#![cfg_attr(feature = "swap", feature(nonnull_slice_from_raw_parts))] +#![cfg_attr(feature = "swap", feature(slice_ptr_len))] +#![cfg_attr(feature = "swap", feature(alloc_layout_extra))] +#![cfg_attr(all(test, feature = "swap"), feature(alloc_error_hook))] +#![cfg_attr(all(test, feature = "swap"), feature(cfg_sanitize))] + +#[macro_use] +extern crate lazy_static; +extern crate scopeguard; +#[cfg(feature = "nightly")] +extern crate test; + +macro_rules! 
box_err { + ($e:expr) => ({ + use std::error::Error; + let e: Box = format!("[{}:{}]: {}", file!(), line!(), $e).into(); + e.into() + }); + ($f:tt, $($arg:expr),+) => ({ + box_err!(format!($f, $($arg),+)) + }); +} + +mod codec; +mod config; +mod consistency; +mod engine; +mod errors; +mod event_listener; +mod file_pipe_log; +#[cfg(feature = "scripting")] +mod filter; +mod fork; +mod log_batch; +mod memtable; +mod metrics; +mod pipe_log; +mod purge; +#[cfg(feature = "swap")] +mod swappy_allocator; +#[cfg(test)] +mod test_util; +mod util; +mod write_barrier; + +pub mod env; + +pub use config::{Config, RecoveryMode}; +pub use engine::Engine; +pub use errors::{Error, Result}; +pub use log_batch::{Command, LogBatch, MessageExt}; +pub use metrics::{get_perf_context, set_perf_context, take_perf_context, PerfContext}; +pub use pipe_log::Version; +pub use util::ReadableSize; + +#[cfg(feature = "internals")] +pub mod internals { + /// A selective view of key components in Raft Engine. Exported under the + /// `internals` feature only. + pub use crate::event_listener::*; + pub use crate::file_pipe_log::*; + pub use crate::memtable::*; + pub use crate::pipe_log::*; + pub use crate::purge::*; + #[cfg(feature = "swap")] + pub use crate::swappy_allocator::*; + pub use crate::write_barrier::*; +} + +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[derive(Default)] +pub struct GlobalStats { + live_append_entries: AtomicUsize, + rewrite_entries: AtomicUsize, + deleted_rewrite_entries: AtomicUsize, +} + +impl GlobalStats { + #[inline] + pub fn add(&self, queue: pipe_log::LogQueue, count: usize) { + match queue { + pipe_log::LogQueue::Append => { + self.live_append_entries.fetch_add(count, Ordering::Relaxed); + } + pipe_log::LogQueue::Rewrite => { + self.rewrite_entries.fetch_add(count, Ordering::Relaxed); + } + } + } + + #[inline] + pub fn delete(&self, queue: pipe_log::LogQueue, count: usize) { + match queue { + pipe_log::LogQueue::Append => { + self.live_append_entries.fetch_sub(count, Ordering::Relaxed); + } + pipe_log::LogQueue::Rewrite => { + self.deleted_rewrite_entries + .fetch_add(count, Ordering::Relaxed); + } + } + } + + #[inline] + pub fn rewrite_entries(&self) -> usize { + self.rewrite_entries.load(Ordering::Relaxed) + } + + #[inline] + pub fn deleted_rewrite_entries(&self) -> usize { + self.deleted_rewrite_entries.load(Ordering::Relaxed) + } + + #[inline] + pub fn reset_rewrite_counters(&self) { + let dop = self.deleted_rewrite_entries.load(Ordering::Relaxed); + self.deleted_rewrite_entries + .fetch_sub(dop, Ordering::Relaxed); + self.rewrite_entries.fetch_sub(dop, Ordering::Relaxed); + } + + #[inline] + pub fn live_entries(&self, queue: pipe_log::LogQueue) -> usize { + match queue { + pipe_log::LogQueue::Append => self.live_append_entries.load(Ordering::Relaxed), + pipe_log::LogQueue::Rewrite => { + let op = self.rewrite_entries.load(Ordering::Relaxed); + let dop = self.deleted_rewrite_entries.load(Ordering::Relaxed); + debug_assert!(op >= dop); + op.saturating_sub(dop) + } + } + } + + #[inline] + pub fn flush_metrics(&self) { + metrics::LOG_ENTRY_COUNT + .rewrite + .set(self.live_entries(pipe_log::LogQueue::Rewrite) as i64); + metrics::LOG_ENTRY_COUNT + .append + .set(self.live_entries(pipe_log::LogQueue::Append) as i64); + } +} + +pub(crate) const INTERNAL_KEY_PREFIX: &[u8] = b"__"; + +#[inline] +#[cfg(test)] +pub(crate) fn make_internal_key(k: &[u8]) -> Vec { + assert!(!k.is_empty()); + let mut v = INTERNAL_KEY_PREFIX.to_vec(); + v.extend_from_slice(k); + v +} + +#[cfg(not(test))] 
+pub(crate) fn make_internal_key(k: &[u8]) -> Vec { + use log_batch::ATOMIC_GROUP_KEY; + + assert!(k == ATOMIC_GROUP_KEY); + let mut v = INTERNAL_KEY_PREFIX.to_vec(); + v.extend_from_slice(k); + v +} + +/// We ensure internal keys are not visible to the user by: +/// (1) Writing internal keys will be rejected by `LogBatch::put`. +/// (2) Internal keys are filtered out during apply and replay of both queues. +/// This also makes sure future internal keys under the prefix won't become +/// visible after downgrading. +#[inline] +#[cfg(test)] +pub(crate) fn is_internal_key(s: &[u8], ext: Option<&[u8]>) -> bool { + if let Some(ext) = ext { + s.len() == INTERNAL_KEY_PREFIX.len() + ext.len() + && s[..INTERNAL_KEY_PREFIX.len()] == *INTERNAL_KEY_PREFIX + && s[INTERNAL_KEY_PREFIX.len()..] == *ext + } else { + s.len() > INTERNAL_KEY_PREFIX.len() + && s[..INTERNAL_KEY_PREFIX.len()] == *INTERNAL_KEY_PREFIX + } +} + +#[inline] +#[cfg(not(test))] +pub(crate) fn is_internal_key(s: &[u8], ext: Option<&[u8]>) -> bool { + use log_batch::ATOMIC_GROUP_KEY; + + if let Some(ext) = ext { + s.len() == INTERNAL_KEY_PREFIX.len() + ext.len() + && s[..INTERNAL_KEY_PREFIX.len()] == *INTERNAL_KEY_PREFIX + && s[INTERNAL_KEY_PREFIX.len()..] == *ext + } else { + is_internal_key(s, Some(ATOMIC_GROUP_KEY)) + } +} + +#[cfg(test)] +mod tests { + use crate::log_batch::MessageExt; + use raft::eraftpb::Entry; + + #[ctor::ctor] + fn init() { + env_logger::init(); + } + + impl MessageExt for Entry { + type Entry = Entry; + + fn index(e: &Self::Entry) -> u64 { + e.index + } + } + + #[test] + fn test_internal_key() { + let key = crate::make_internal_key(&[0]); + assert!(crate::is_internal_key(&key, None)); + assert!(crate::is_internal_key(&key, Some(&[0]))); + assert!(!crate::is_internal_key(&key, Some(&[1]))); + } +} diff --git a/third/raft-engine/src/log_batch.rs b/third/raft-engine/src/log_batch.rs new file mode 100644 index 00000000..c750acbe --- /dev/null +++ b/third/raft-engine/src/log_batch.rs @@ -0,0 +1,1707 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use std::fmt::Debug; +use std::io::BufRead; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::{mem, u64}; + +use byteorder::{BigEndian, LittleEndian, ReadBytesExt, WriteBytesExt}; +use log::error; +use num_derive::FromPrimitive; +use num_traits::FromPrimitive; +use prost::Message; + +use crate::codec::{self, NumberEncoder}; +use crate::memtable::EntryIndex; +use crate::metrics::StopWatch; +use crate::pipe_log::{FileBlockHandle, FileId, LogFileContext, ReactiveBytes}; +use crate::util::{crc32, lz4}; +use crate::{perf_context, Error, Result}; + +pub(crate) const LOG_BATCH_HEADER_LEN: usize = 16; +pub(crate) const LOG_BATCH_CHECKSUM_LEN: usize = 4; + +const TYPE_ENTRIES: u8 = 0x01; +const TYPE_COMMAND: u8 = 0x02; +const TYPE_KV: u8 = 0x3; + +const CMD_CLEAN: u8 = 0x01; +const CMD_COMPACT: u8 = 0x02; + +const DEFAULT_LOG_ITEM_BATCH_CAP: usize = 64; +const MAX_LOG_BATCH_BUFFER_CAP: usize = 8 * 1024 * 1024; +// 2GiB, The maximum content length accepted by lz4 compression. +const MAX_LOG_ENTRIES_SIZE_PER_BATCH: usize = i32::MAX as usize; + +/// `MessageExt` trait allows for probing log index from a specific type of +/// protobuf messages. +pub trait MessageExt: Send + Sync { + type Entry: Message + Default + Clone + PartialEq; + + fn index(e: &Self::Entry) -> u64; +} + +/// Types of compression. 
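+///
+/// Entries are compressed with LZ4 only when `LogBatch::finish_populate` is
+/// called with a non-zero `compression_threshold` and the encoded entries are
+/// at least that large; the chosen type is persisted in the lowest byte of the
+/// log batch header (see `LogBatch::decode_header`).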
+#[repr(u8)] +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum CompressionType { + None = 0, + Lz4 = 1, +} + +impl CompressionType { + pub fn from_u8(t: u8) -> Result { + if t <= CompressionType::Lz4 as u8 { + Ok(unsafe { mem::transmute(t) }) + } else { + Err(Error::Corruption(format!( + "Unrecognized compression type: {t}" + ))) + } + } + + pub fn to_u8(self) -> u8 { + self as u8 + } +} + +type SliceReader<'a> = &'a [u8]; + +// Format: +// { count | first index | [ tail offsets ] } +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct EntryIndexes(pub Vec); + +impl EntryIndexes { + pub fn decode(buf: &mut SliceReader, entries_size: &mut u32) -> Result { + let mut count = codec::decode_var_u64(buf)?; + let mut entry_indexes = Vec::with_capacity(count as usize); + let mut index = 0; + if count > 0 { + index = codec::decode_var_u64(buf)?; + } + while count > 0 { + let t = codec::decode_var_u64(buf)?; + let entry_len = (t as u32) - *entries_size; + let entry_index = EntryIndex { + index, + entry_offset: *entries_size, + entry_len, + ..Default::default() + }; + *entries_size += entry_len; + entry_indexes.push(entry_index); + index += 1; + count -= 1; + } + Ok(Self(entry_indexes)) + } + + pub fn encode(&self, buf: &mut Vec) -> Result<()> { + let count = self.0.len() as u64; + buf.encode_var_u64(count)?; + if count > 0 { + buf.encode_var_u64(self.0[0].index)?; + } + for ei in self.0.iter() { + buf.encode_var_u64((ei.entry_offset + ei.entry_len) as u64)?; + } + Ok(()) + } + + fn approximate_size(&self) -> usize { + 8 /*count*/ + if self.0.is_empty() { 0 } else { 8 } /*first index*/ + 8 /*tail offset*/ * self.0.len() + } +} + +// Format: +// { type | (index) } +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum Command { + Clean, + Compact { index: u64 }, +} + +impl Command { + pub fn encode(&self, vec: &mut Vec) { + match *self { + Command::Clean => { + vec.push(CMD_CLEAN); + } + Command::Compact { index } => { + vec.push(CMD_COMPACT); + vec.encode_var_u64(index).unwrap(); + } + } + } + + pub fn decode(buf: &mut SliceReader) -> Result { + let command_type = codec::read_u8(buf)?; + match command_type { + CMD_CLEAN => Ok(Command::Clean), + CMD_COMPACT => { + let index = codec::decode_var_u64(buf)?; + Ok(Command::Compact { index }) + } + _ => Err(Error::Corruption(format!( + "Unrecognized command type: {command_type}" + ))), + } + } + + fn approximate_size(&self) -> usize { + match &self { + Command::Clean => 1, /* type */ + Command::Compact { .. } => 1 + 8, /* type + index */ + } + } +} + +#[repr(u8)] +#[derive(Debug, PartialEq, Eq, Copy, Clone)] +pub enum OpType { + Put = 1, + Del = 2, +} + +impl OpType { + pub fn from_u8(t: u8) -> Result { + if t <= OpType::Del as u8 { + Ok(unsafe { mem::transmute(t) }) + } else { + Err(Error::Corruption(format!("Unrecognized op type: {t}"))) + } + } + + pub fn to_u8(self) -> u8 { + self as u8 + } +} + +// Format: +// { op_type | key len | key | ( value len | value ) } +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct KeyValue { + pub op_type: OpType, + pub key: Vec, + pub value: Option>, + pub file_id: Option, +} + +impl KeyValue { + pub fn new(op_type: OpType, key: Vec, value: Option>) -> KeyValue { + KeyValue { + op_type, + key, + value, + file_id: None, + } + } + + pub fn decode(buf: &mut SliceReader) -> Result { + let op_type = OpType::from_u8(codec::read_u8(buf)?)?; + let k_len = codec::decode_var_u64(buf)? as usize; + let key = &buf[..k_len]; + buf.consume(k_len); + match op_type { + OpType::Put => { + let v_len = codec::decode_var_u64(buf)? 
as usize; + let value = &buf[..v_len]; + buf.consume(v_len); + Ok(KeyValue::new( + OpType::Put, + key.to_vec(), + Some(value.to_vec()), + )) + } + OpType::Del => Ok(KeyValue::new(OpType::Del, key.to_vec(), None)), + } + } + + pub fn encode(&self, vec: &mut Vec) -> Result<()> { + vec.push(self.op_type.to_u8()); + vec.encode_var_u64(self.key.len() as u64)?; + vec.extend_from_slice(self.key.as_slice()); + match self.op_type { + OpType::Put => { + vec.encode_var_u64(self.value.as_ref().unwrap().len() as u64)?; + vec.extend_from_slice(self.value.as_ref().unwrap().as_slice()); + } + OpType::Del => {} + } + Ok(()) + } + + fn approximate_size(&self) -> usize { + 1 /*op*/ + 8 /*k_len*/ + self.key.len() + 8 /*v_len*/ + self.value.as_ref().map_or_else(|| 0, |v| v.len()) + } +} + +// Format: +// { 8 byte region id | 1 byte type | item } +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct LogItem { + pub raft_group_id: u64, + pub content: LogItemContent, +} + +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum LogItemContent { + EntryIndexes(EntryIndexes), + Command(Command), + Kv(KeyValue), +} + +impl LogItem { + pub fn new_entry_indexes(raft_group_id: u64, entry_indexes: Vec) -> LogItem { + LogItem { + raft_group_id, + content: LogItemContent::EntryIndexes(EntryIndexes(entry_indexes)), + } + } + + pub fn new_command(raft_group_id: u64, command: Command) -> LogItem { + LogItem { + raft_group_id, + content: LogItemContent::Command(command), + } + } + + pub fn new_kv( + raft_group_id: u64, + op_type: OpType, + key: Vec, + value: Option>, + ) -> LogItem { + LogItem { + raft_group_id, + content: LogItemContent::Kv(KeyValue::new(op_type, key, value)), + } + } + + pub fn encode(&self, buf: &mut Vec) -> Result<()> { + buf.encode_var_u64(self.raft_group_id)?; + match &self.content { + LogItemContent::EntryIndexes(entry_indexes) => { + buf.push(TYPE_ENTRIES); + entry_indexes.encode(buf)?; + } + LogItemContent::Command(command) => { + buf.push(TYPE_COMMAND); + command.encode(buf); + } + LogItemContent::Kv(kv) => { + buf.push(TYPE_KV); + kv.encode(buf)?; + } + } + Ok(()) + } + + pub fn decode(buf: &mut SliceReader, entries_size: &mut u32) -> Result { + let raft_group_id = codec::decode_var_u64(buf)?; + let item_type = buf.read_u8()?; + let content = match item_type { + TYPE_ENTRIES => { + let entry_indexes = EntryIndexes::decode(buf, entries_size)?; + LogItemContent::EntryIndexes(entry_indexes) + } + TYPE_COMMAND => { + let cmd = Command::decode(buf)?; + LogItemContent::Command(cmd) + } + TYPE_KV => { + let kv = KeyValue::decode(buf)?; + LogItemContent::Kv(kv) + } + _ => { + return Err(Error::Corruption(format!( + "Unrecognized log item type: {item_type}" + ))); + } + }; + Ok(LogItem { + raft_group_id, + content, + }) + } + + fn approximate_size(&self) -> usize { + match &self.content { + LogItemContent::EntryIndexes(entry_indexes) => { + 8 /*r_id*/ + 1 /*type*/ + entry_indexes.approximate_size() + } + LogItemContent::Command(cmd) => 8 + 1 + cmd.approximate_size(), + LogItemContent::Kv(kv) => 8 + 1 + kv.approximate_size(), + } + } +} + +pub(crate) type LogItemDrain<'a> = std::vec::Drain<'a, LogItem>; + +/// A lean batch of log item, without entry data. 
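+///
+/// The batch stores only entry *indexes*; the entry payloads live in the
+/// (optionally compressed) entries section of the owning `LogBatch` and are
+/// located later through the `FileBlockHandle` attached by `finish_write`.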
+// Format: +// { item count | [items] | crc32 } +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct LogItemBatch { + items: Vec, + item_size: usize, + entries_size: u32, + checksum: u32, +} + +impl Default for LogItemBatch { + fn default() -> Self { + Self::with_capacity(0) + } +} + +impl LogItemBatch { + pub fn with_capacity(cap: usize) -> Self { + Self { + items: Vec::with_capacity(cap), + item_size: 0, + entries_size: 0, + checksum: 0, + } + } + + // TODO: Clean up these interfaces. + pub fn into_items(self) -> Vec { + self.items + } + + pub fn iter(&self) -> std::slice::Iter { + self.items.iter() + } + + pub fn drain(&mut self) -> LogItemDrain { + self.item_size = 0; + self.entries_size = 0; + self.checksum = 0; + self.items.drain(..) + } + + pub fn merge(&mut self, rhs: &mut LogItemBatch) { + for item in &mut rhs.items { + if let LogItemContent::EntryIndexes(entry_indexes) = &mut item.content { + for ei in entry_indexes.0.iter_mut() { + ei.entry_offset += self.entries_size; + } + } + } + self.item_size += rhs.item_size; + rhs.item_size = 0; + self.entries_size += rhs.entries_size; + rhs.entries_size = 0; + self.items.append(&mut rhs.items); + } + + pub(crate) fn finish_populate(&mut self, compression_type: CompressionType) { + for item in self.items.iter_mut() { + if let LogItemContent::EntryIndexes(entry_indexes) = &mut item.content { + for ei in entry_indexes.0.iter_mut() { + ei.compression_type = compression_type; + } + } + } + } + + /// Prepare the `write` by signing a checksum, so-called `signature`, + /// into the encoded buffer corresponding to `LogBatch`. + /// + /// The `signature` is both generated by the given `LogFileContext`. + /// That is, the final checksum of each `LogBatch` consists of this + /// `signature` and the original `checksum` of the contents. + pub(crate) fn prepare_write( + &self, + buf: &mut [u8], + file_context: &LogFileContext, + ) -> Result<()> { + if !buf.is_empty() { + let mut footer_checksum = self.checksum; + // Fill the placeholder (checksum) in `LogItemBatch` by inserting the + // signature into the encoded bytes. + let footer_checksum_offset = buf.len() - LOG_BATCH_CHECKSUM_LEN; + if let Some(signature) = file_context.get_signature() { + // The final checksum is generated by `signature` ***XOR*** + // `original checksum of buf`. + footer_checksum ^= signature; + } + (&mut buf[footer_checksum_offset..]).write_u32::(footer_checksum)?; + } + Ok(()) + } + + pub(crate) fn finish_write(&mut self, handle: FileBlockHandle) { + for item in self.items.iter_mut() { + match &mut item.content { + LogItemContent::EntryIndexes(entry_indexes) => { + for ei in entry_indexes.0.iter_mut() { + // No assert!(ei.entries.is_none): + // It's possible that batch containing rewritten index already + // has entries location. 
+ ei.entries = Some(handle); + } + } + LogItemContent::Kv(kv) => { + debug_assert!(kv.file_id.is_none()); + kv.file_id = Some(handle.id); + } + _ => {} + } + } + } + + pub fn add_entry_indexes(&mut self, region_id: u64, mut entry_indexes: Vec) { + for ei in entry_indexes.iter_mut() { + ei.entry_offset = self.entries_size; + self.entries_size += ei.entry_len; + } + let item = LogItem::new_entry_indexes(region_id, entry_indexes); + self.item_size += item.approximate_size(); + self.items.push(item); + } + + pub fn add_command(&mut self, region_id: u64, cmd: Command) { + let item = LogItem::new_command(region_id, cmd); + self.item_size += item.approximate_size(); + self.items.push(item); + } + + pub fn delete(&mut self, region_id: u64, key: Vec) { + let item = LogItem::new_kv(region_id, OpType::Del, key, None); + self.item_size += item.approximate_size(); + self.items.push(item); + } + + pub fn put_message(&mut self, region_id: u64, key: Vec, s: &S) -> Result<()> { + self.put(region_id, key, s.encode_to_vec()); + Ok(()) + } + + pub fn put(&mut self, region_id: u64, key: Vec, value: Vec) { + let item = LogItem::new_kv(region_id, OpType::Put, key, Some(value)); + self.item_size += item.approximate_size(); + self.items.push(item); + } + + pub fn encode(&mut self, buf: &mut Vec) -> Result<()> { + let offset = buf.len(); + let count = self.items.len() as u64; + buf.encode_var_u64(count)?; + for item in self.items.iter() { + item.encode(buf)?; + } + self.checksum = crc32(&buf[offset..]); + // Just leave a placeholder for the final checksum, which will be filled + // by later `prepare_write(...)` progress. + buf.encode_u32_le(0)?; + Ok(()) + } + + /// Decodes a `LogItemBatch` from bytes of footer. `entries` is the block + /// location of encoded entries. + pub fn decode( + buf: &mut SliceReader, + entries: FileBlockHandle, + compression_type: CompressionType, + file_context: &LogFileContext, + ) -> Result { + // Validate the checksum of each LogItemBatch by the signature. + let checksum = verify_checksum_with_signature(buf, file_context.get_signature())?; + *buf = &buf[..buf.len() - LOG_BATCH_CHECKSUM_LEN]; + let count = codec::decode_var_u64(buf)?; + let mut items = LogItemBatch::with_capacity(count as usize); + let mut entries_size = 0; + for _ in 0..count { + let item = LogItem::decode(buf, &mut entries_size)?; + items.item_size += item.approximate_size(); + items.items.push(item); + } + items.entries_size = entries_size; + + for item in items.items.iter_mut() { + if let LogItemContent::EntryIndexes(entry_indexes) = &mut item.content { + for ei in entry_indexes.0.iter_mut() { + ei.compression_type = compression_type; + ei.entries = Some(entries); + } + } else if let LogItemContent::Kv(kv) = &mut item.content { + kv.file_id = Some(entries.id); + } + } + items.checksum = checksum; + Ok(items) + } + + pub fn approximate_size(&self) -> usize { + 8 /*count*/ + self.item_size + LOG_BATCH_CHECKSUM_LEN + } + + /// Returns the first [`EntryIndex`] appeared in this batch. + pub fn entry_index(&self) -> Option { + for item in &self.items { + if let LogItemContent::EntryIndexes(entry_indexes) = &item.content { + return entry_indexes.0.first().cloned(); + } + } + None + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum BufState { + /// Buffer contains header and optionally entries. + /// # Invariants + /// LOG_BATCH_HEADER_LEN <= buf.len() + Open, + /// Buffer contains header, entries and footer; ready to be written. 
The + /// footer may be signed with extra information depending on the format + /// version. + /// # Content + /// (header_offset, entries_len) + Encoded(usize, usize), + /// Buffer contains header, entries and footer; ready to be written. This + /// state only briefly exists between encoding and writing, user operation + /// will panic under this state. + /// # Content + /// (header_offset, entries_len) + /// # Invariants + /// LOG_BATCH_HEADER_LEN <= buf.len() + Sealed(usize, usize), + /// Buffer is undergoing writes. User operation will panic under this state. + Incomplete, +} + +/// A batch of log items. +/// +/// Encoding format: +/// - header = { u56 len | u8 compression type | u64 item offset } +/// - entries = { [entry..] (optionally compressed) | crc32 } +/// - footer = { item batch } +/// +/// Size restriction: +/// - The total size of log entries must not exceed 2GiB. +/// +/// Error will be raised if a to-be-added log item cannot fit within those +/// limits. +// Calling order: +// 1. Insert some log items +// 2. [`finish_populate`] +// 3. Write to disk with [`encoded_bytes`] +// 4. Update disk location with [`finish_write`] +// 5. Clean up the memory states with [`drain`]. Step 4 can be skipped if the states are not used +// (e.g. to apply memtable). +#[derive(Clone, PartialEq, Eq, Debug)] +pub struct LogBatch { + item_batch: LogItemBatch, + buf_state: BufState, + buf: Vec, +} + +impl Default for LogBatch { + fn default() -> Self { + Self::with_capacity(DEFAULT_LOG_ITEM_BATCH_CAP) + } +} + +impl LogBatch { + /// Creates a new, empty log batch capable of holding at least `cap` log + /// items. + pub fn with_capacity(cap: usize) -> Self { + let mut buf = Vec::with_capacity(4096); + buf.resize(LOG_BATCH_HEADER_LEN, 0); + Self { + item_batch: LogItemBatch::with_capacity(cap), + buf_state: BufState::Open, + buf, + } + } + + /// Moves all log items of `rhs` into `Self`, leaving `rhs` empty. + pub fn merge(&mut self, rhs: &mut Self) -> Result<()> { + debug_assert!(self.buf_state == BufState::Open && rhs.buf_state == BufState::Open); + let max_entries_size = (|| { + fail::fail_point!("log_batch::1kb_entries_size_per_batch", |_| 1024); + MAX_LOG_ENTRIES_SIZE_PER_BATCH + })(); + if !rhs.buf.is_empty() { + if rhs.buf.len() + self.buf.len() > max_entries_size + LOG_BATCH_HEADER_LEN * 2 { + return Err(Error::Full); + } + self.buf_state = BufState::Incomplete; + rhs.buf_state = BufState::Incomplete; + self.buf.extend_from_slice(&rhs.buf[LOG_BATCH_HEADER_LEN..]); + rhs.buf.shrink_to(MAX_LOG_BATCH_BUFFER_CAP); + rhs.buf.truncate(LOG_BATCH_HEADER_LEN); + } + self.item_batch.merge(&mut rhs.item_batch); + self.buf_state = BufState::Open; + rhs.buf_state = BufState::Open; + Ok(()) + } + + /// Adds some protobuf log entries into the log batch. 
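+    ///
+    /// A usage sketch with a hypothetical `region_id` and entry slice, assuming
+    /// a `MessageExt` implementation for the entry type (the test module of
+    /// this crate provides one for `raft::eraftpb::Entry`):
+    ///
+    /// ```ignore
+    /// let mut batch = LogBatch::default();
+    /// batch.add_entries::<Entry>(region_id, &entries)?;
+    /// ```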
+ pub fn add_entries( + &mut self, + region_id: u64, + entries: &[M::Entry], + ) -> Result<()> { + debug_assert!(self.buf_state == BufState::Open); + if entries.is_empty() { + return Ok(()); + } + + let mut entry_indexes = Vec::with_capacity(entries.len()); + self.buf_state = BufState::Incomplete; + let old_buf_len = self.buf.len(); + let max_entries_size = (|| { + fail::fail_point!("log_batch::1kb_entries_size_per_batch", |_| 1024); + MAX_LOG_ENTRIES_SIZE_PER_BATCH + })(); + for e in entries { + let buf_offset = self.buf.len(); + + e.encode(&mut self.buf)?; + if self.buf.len() > max_entries_size + LOG_BATCH_HEADER_LEN { + self.buf.truncate(old_buf_len); + self.buf_state = BufState::Open; + return Err(Error::Full); + } + entry_indexes.push(EntryIndex { + index: M::index(e), + entry_len: (self.buf.len() - buf_offset) as u32, + ..Default::default() + }); + } + self.item_batch.add_entry_indexes(region_id, entry_indexes); + self.buf_state = BufState::Open; + Ok(()) + } + + /// Adds some log entries with specified encoded data into the log batch. + /// Assumes there are the same amount of entry indexes as the encoded data + /// vectors. + pub(crate) fn add_raw_entries( + &mut self, + region_id: u64, + mut entry_indexes: Vec, + entries: Vec>, + ) -> Result<()> { + debug_assert!(entry_indexes.len() == entries.len()); + debug_assert!(self.buf_state == BufState::Open); + if entry_indexes.is_empty() { + return Ok(()); + } + + self.buf_state = BufState::Incomplete; + let old_buf_len = self.buf.len(); + let max_entries_size = (|| { + fail::fail_point!("log_batch::1kb_entries_size_per_batch", |_| 1024); + MAX_LOG_ENTRIES_SIZE_PER_BATCH + })(); + for (ei, e) in entry_indexes.iter_mut().zip(entries.iter()) { + if e.len() + self.buf.len() > max_entries_size + LOG_BATCH_HEADER_LEN { + self.buf.truncate(old_buf_len); + self.buf_state = BufState::Open; + return Err(Error::Full); + } + let buf_offset = self.buf.len(); + self.buf.extend(e); + ei.entry_len = (self.buf.len() - buf_offset) as u32; + } + self.item_batch.add_entry_indexes(region_id, entry_indexes); + self.buf_state = BufState::Open; + Ok(()) + } + + /// Adds a command into the log batch. + pub fn add_command(&mut self, region_id: u64, cmd: Command) { + self.item_batch.add_command(region_id, cmd); + } + + /// Removes a key value pair from the log batch. + pub fn delete(&mut self, region_id: u64, key: Vec) { + self.item_batch.delete(region_id, key); + } + + /// Adds a protobuf key value pair into the log batch. + pub fn put_message(&mut self, region_id: u64, key: Vec, s: &S) -> Result<()> { + if crate::is_internal_key(&key, None) { + return Err(Error::InvalidArgument(format!( + "key prefix `{:?}` reserved for internal use", + crate::INTERNAL_KEY_PREFIX + ))); + } + self.item_batch.put_message(region_id, key, s) + } + + /// Adds a key value pair into the log batch. + pub fn put(&mut self, region_id: u64, key: Vec, value: Vec) -> Result<()> { + if crate::is_internal_key(&key, None) { + return Err(Error::InvalidArgument(format!( + "key prefix `{:?}` reserved for internal use", + crate::INTERNAL_KEY_PREFIX + ))); + } + self.item_batch.put(region_id, key, value); + Ok(()) + } + + pub(crate) fn put_unchecked(&mut self, region_id: u64, key: Vec, value: Vec) { + self.item_batch.put(region_id, key, value); + } + + /// Returns true if the log batch contains no log item. + pub fn is_empty(&self) -> bool { + self.item_batch.items.is_empty() + } + + /// Notifies the completion of log item population. User must not add any + /// more log content after this call. 
Returns the length of encoded data. + /// + /// Internally, encodes and optionally compresses log entries. Sets the + /// compression type to each entry index. + pub(crate) fn finish_populate( + &mut self, + compression_threshold: usize, + compression_level: Option, + ) -> Result<(usize, f64)> { + let _t = StopWatch::new(perf_context!(log_populating_duration)); + debug_assert!(self.buf_state == BufState::Open); + if self.is_empty() { + self.buf_state = BufState::Encoded(self.buf.len(), 0); + return Ok((0, 0.0)); + } + self.buf_state = BufState::Incomplete; + + // entries + let (header_offset, compression_type, compression_ratio) = if compression_threshold > 0 + && self.buf.len() >= LOG_BATCH_HEADER_LEN + compression_threshold + { + let buf_len = self.buf.len(); + let compression_ratio = lz4::append_compress_block( + &mut self.buf, + LOG_BATCH_HEADER_LEN, + compression_level.unwrap_or(lz4::DEFAULT_LZ4_COMPRESSION_LEVEL), + )?; + ( + buf_len - LOG_BATCH_HEADER_LEN, + CompressionType::Lz4, + compression_ratio, + ) + } else { + (0, CompressionType::None, 0.0) + }; + + // checksum + if self.buf.len() > header_offset + LOG_BATCH_HEADER_LEN { + let checksum = crc32(&self.buf[header_offset + LOG_BATCH_HEADER_LEN..]); + self.buf.encode_u32_le(checksum)?; + } + // `footer_roffset` records the start offset of encoded `self.item_batch` + let footer_roffset = self.buf.len() - header_offset; + + // footer + self.item_batch.encode(&mut self.buf)?; + self.item_batch.finish_populate(compression_type); + + // header + let len = + (((self.buf.len() - header_offset) as u64) << 8) | u64::from(compression_type.to_u8()); + (&mut self.buf[header_offset..header_offset + 8]).write_u64::(len)?; + (&mut self.buf[header_offset + 8..header_offset + 16]) + .write_u64::(footer_roffset as u64)?; + + #[cfg(feature = "failpoints")] + { + let corrupted_items = || { + fail::fail_point!("log_batch::corrupted_items", |_| true); + false + }; + if corrupted_items() { + self.buf[footer_roffset] += 1; + } + let corrupted_entries = || { + fail::fail_point!("log_batch::corrupted_entries", |_| true); + false + }; + if corrupted_entries() { + assert!(footer_roffset > LOG_BATCH_HEADER_LEN); + self.buf[footer_roffset - 1] += 1; + } + } + + self.buf_state = BufState::Encoded(header_offset, footer_roffset - LOG_BATCH_HEADER_LEN); + Ok((self.buf.len() - header_offset, compression_ratio)) + } + + /// Make preparations for the write of `LogBatch`. + #[inline] + pub(crate) fn prepare_write(&mut self, file_context: &LogFileContext) -> Result<()> { + match self.buf_state { + // `BufState::Sealed` means that `LogBatch` is under a repeated state of dumping. + BufState::Encoded(header_offset, entries_len) + | BufState::Sealed(header_offset, entries_len) => { + self.item_batch + .prepare_write(&mut self.buf[header_offset + entries_len..], file_context)?; + self.buf_state = BufState::Sealed(header_offset, entries_len); + } + _ => unreachable!(), + } + Ok(()) + } + + /// Returns a slice of bytes containing encoded data of this log batch. + /// Assumes called after a successful call of [`prepare_write`]. + pub(crate) fn encoded_bytes(&self) -> &[u8] { + match self.buf_state { + BufState::Sealed(header_offset, _) => &self.buf[header_offset..], + _ => unreachable!(), + } + } + + /// Notifies the completion of a storage write with the written location. + /// + /// Internally sets the file locations of each log entry indexes. 
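+    ///
+    /// For a non-empty batch, the handle passed in covers the whole encoded
+    /// batch on disk; it is narrowed here to just the entries section (the
+    /// 16-byte header skipped, the footer excluded) before being attached to
+    /// every entry index and key-value in the batch.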
+ pub(crate) fn finish_write(&mut self, mut handle: FileBlockHandle) { + debug_assert!(matches!(self.buf_state, BufState::Sealed(_, _))); + if !self.is_empty() { + // adjust log batch handle to log entries handle. + handle.offset += LOG_BATCH_HEADER_LEN as u64; + match self.buf_state { + BufState::Sealed(_, entries_len) => { + debug_assert!(LOG_BATCH_HEADER_LEN + entries_len < handle.len); + handle.len = entries_len; + } + _ => unreachable!(), + } + } + self.item_batch.finish_write(handle); + } + + /// Consumes log items into an iterator. + pub(crate) fn drain(&mut self) -> LogItemDrain { + debug_assert!(!matches!(self.buf_state, BufState::Incomplete)); + + self.buf.shrink_to(MAX_LOG_BATCH_BUFFER_CAP); + self.buf.truncate(LOG_BATCH_HEADER_LEN); + self.buf_state = BufState::Open; + self.item_batch.drain() + } + + /// Returns approximate encoded size of this log batch. Might be larger + /// than the actual size. + pub fn approximate_size(&self) -> usize { + if self.is_empty() { + 0 + } else { + match self.buf_state { + BufState::Open => { + self.buf.len() + LOG_BATCH_CHECKSUM_LEN + self.item_batch.approximate_size() + } + BufState::Encoded(header_offset, _) => self.buf.len() - header_offset, + BufState::Sealed(header_offset, _) => self.buf.len() - header_offset, + s => { + error!("querying incomplete log batch with state {s:?}"); + 0 + } + } + } + } + + /// Returns header information from some bytes. + /// + /// The information includes: + /// + /// + The offset of log items + /// + The compression type of entries + /// + The total length of this log batch. + pub(crate) fn decode_header(buf: &mut SliceReader) -> Result<(usize, CompressionType, usize)> { + if buf.len() < LOG_BATCH_HEADER_LEN { + return Err(Error::Corruption(format!( + "Log batch header too short: {}", + buf.len() + ))); + } + + let len_and_type = codec::decode_u64(buf)? as usize; + let compression_type = CompressionType::from_u8(len_and_type as u8)?; + let len = len_and_type >> 8; + let offset = codec::decode_u64(buf)? as usize; + if offset > len { + return Err(Error::Corruption( + "Log item offset exceeds log batch length".to_owned(), + )); + } else if offset < LOG_BATCH_HEADER_LEN { + return Err(Error::Corruption( + "Log item offset is smaller than log batch header length".to_owned(), + )); + } + Ok((offset, compression_type, len)) + } + + /// Unfolds bytes of multiple user entries from an encoded block. + pub(crate) fn decode_entries_block( + buf: &[u8], + handle: FileBlockHandle, + compression: CompressionType, + ) -> Result> { + if handle.len > 0 { + let _ = verify_checksum_with_signature(&buf[0..handle.len], None)?; + match compression { + CompressionType::None => Ok(buf[..handle.len - LOG_BATCH_CHECKSUM_LEN].to_owned()), + CompressionType::Lz4 => { + let decompressed = + lz4::decompress_block(&buf[..handle.len - LOG_BATCH_CHECKSUM_LEN])?; + Ok(decompressed) + } + } + } else { + Ok(Vec::new()) + } + } +} + +impl ReactiveBytes for LogBatch { + fn as_bytes(&mut self, ctx: &LogFileContext) -> &[u8] { + self.prepare_write(ctx).unwrap(); + self.encoded_bytes() + } +} + +/// Verifies the checksum of a slice of bytes that sequentially holds data and +/// checksum. The checksum field may be signed by XOR-ing with an u32. +/// +/// Returns the checksum of the buffer without signature. 
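+///
+/// In other words, what is stored on disk is `crc32(content) ^ signature`
+/// whenever the file format provides a signature, so verification XORs the
+/// signature back into the stored value before comparing it against a freshly
+/// computed `crc32(content)`.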
+fn verify_checksum_with_signature(buf: &[u8], signature: Option) -> Result { + if buf.len() <= LOG_BATCH_CHECKSUM_LEN { + return Err(Error::Corruption(format!( + "Content too short {}", + buf.len() + ))); + } + let actual = crc32(&buf[..buf.len() - LOG_BATCH_CHECKSUM_LEN]); + let mut expected = codec::decode_u32_le(&mut &buf[buf.len() - LOG_BATCH_CHECKSUM_LEN..])?; + if let Some(signature) = signature { + expected ^= signature; + } + if actual != expected { + return Err(Error::Corruption(format!( + "Checksum expected {expected} but got {actual}" + ))); + } + Ok(actual) +} + +lazy_static! { + static ref ATOMIC_GROUP_ID: Arc = Arc::new(AtomicU64::new(0)); +} +pub(crate) const ATOMIC_GROUP_KEY: &[u8] = &[0x01]; +// +const ATOMIC_GROUP_VALUE_LEN: usize = 1; + +#[repr(u8)] +#[derive(Clone, Copy, FromPrimitive, Debug, PartialEq)] +pub(crate) enum AtomicGroupStatus { + Begin = 0, + Middle = 1, + End = 2, +} + +impl AtomicGroupStatus { + /// Whether the log batch with `item` belongs to an atomic group. + pub fn parse(item: &LogItem) -> Option<(u64, AtomicGroupStatus)> { + if let LogItemContent::Kv(KeyValue { + op_type, + key, + value, + .. + }) = &item.content + { + if *op_type == OpType::Put + && crate::is_internal_key(key, Some(ATOMIC_GROUP_KEY)) + && value.as_ref().unwrap().len() == ATOMIC_GROUP_VALUE_LEN + { + let value = &mut value.as_ref().unwrap().as_slice(); + return Some(( + item.raft_group_id, + AtomicGroupStatus::from_u8(value[0]).unwrap(), + )); + } + } + None + } +} + +/// Group multiple log batches as an atomic operation. +/// +/// Caveats: +/// (1) The atomicity is provided at persistent level. This means, once an +/// atomic group fails, the in-memory value will be inconsistent with what can +/// be recovered from on-disk data files. +/// (2) The recovery replay order will be different from original write order. +/// Log batches in a completed atomic group will be replayed as if they were +/// written together at an arbitary time point within the group. +/// (3) Atomic group is implemented by embedding normal key-values into user +/// writes. These keys have internal key prefix and will not be replayed into +/// memtable. However, when read by an older version, they will behave as user +/// keys. They may also belong to Raft Group that doesn't exist before. +/// +/// In practice, we only use atomic group for rewrite operation. (In fact, +/// atomic group markers in append queue are simply ignored.) Rewrite doesn't +/// change the value of entries, just locations. So first issue doesn't affect +/// correctness. There could only be one worker doing the rewrite. So second +/// issue doesn't change observed write order because there's no mixed write. +pub(crate) struct AtomicGroupBuilder { + id: u64, + status: Option, +} + +impl Default for AtomicGroupBuilder { + fn default() -> Self { + Self { + // We only care there's no collision between concurrent groups. + id: ATOMIC_GROUP_ID.fetch_add(1, Ordering::Relaxed), + status: None, + } + } +} + +impl AtomicGroupBuilder { + /// Each log batch can only carry one atomic group marker. If multiple are + /// present only the first is recognized. 
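+    ///
+    /// A usage sketch with hypothetical batches; one builder spans several log
+    /// batches and stamps each of them with exactly one marker:
+    ///
+    /// ```ignore
+    /// let mut group = AtomicGroupBuilder::default();
+    /// group.begin(&mut first_batch);
+    /// group.add(&mut middle_batch);
+    /// group.end(&mut last_batch);
+    /// // write the batches in this order; recovery treats them atomically
+    /// ```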
+ pub fn begin(&mut self, lb: &mut LogBatch) { + fail::fail_point!("atomic_group::begin"); + assert_eq!(self.status, None); + self.status = Some(AtomicGroupStatus::Begin); + self.flush(lb); + } + + pub fn add(&mut self, lb: &mut LogBatch) { + fail::fail_point!("atomic_group::add"); + assert!(matches!( + self.status, + Some(AtomicGroupStatus::Begin | AtomicGroupStatus::Middle) + )); + self.status = Some(AtomicGroupStatus::Middle); + self.flush(lb); + } + + pub fn end(&mut self, lb: &mut LogBatch) { + assert!(matches!( + self.status, + Some(AtomicGroupStatus::Begin | AtomicGroupStatus::Middle) + )); + self.status = Some(AtomicGroupStatus::End); + self.flush(lb); + } + + #[inline] + fn flush(&self, lb: &mut LogBatch) { + let mut s = Vec::with_capacity(ATOMIC_GROUP_VALUE_LEN); + s.push(self.status.unwrap() as u8); + lb.put_unchecked(self.id, crate::make_internal_key(ATOMIC_GROUP_KEY), s); + } + + #[cfg(test)] + pub fn with_id(id: u64) -> Self { + Self { id, status: None } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::pipe_log::{LogQueue, Version}; + use crate::test_util::{catch_unwind_silent, generate_entries, generate_entry_indexes_opt}; + use raft::eraftpb::Entry; + use strum::IntoEnumIterator; + + fn decode_entries_from_bytes( + buf: &[u8], + entry_indexes: &[EntryIndex], + _encoded: bool, + ) -> Vec { + let mut entries = Vec::with_capacity(entry_indexes.len()); + for ei in entry_indexes { + let block = + LogBatch::decode_entries_block(buf, ei.entries.unwrap(), ei.compression_type) + .unwrap(); + entries.push( + prost::Message::decode( + &block[ei.entry_offset as usize..(ei.entry_offset + ei.entry_len) as usize], + ) + .unwrap(), + ); + } + entries + } + + #[test] + fn test_entry_indexes_enc_dec() { + fn encode_and_decode(entry_indexes: &mut [EntryIndex]) -> EntryIndexes { + let mut entries_size = 0; + for idx in entry_indexes.iter_mut() { + idx.entry_offset = entries_size; + entries_size += idx.entry_len; + } + let entry_indexes = EntryIndexes(entry_indexes.to_vec()); + + let mut encoded = vec![]; + entry_indexes.encode(&mut encoded).unwrap(); + let mut bytes_slice = encoded.as_slice(); + let mut decoded_entries_size = 0; + let decoded_indexes = + EntryIndexes::decode(&mut bytes_slice, &mut decoded_entries_size).unwrap(); + assert_eq!(bytes_slice.len(), 0); + assert!(decoded_indexes.approximate_size() >= encoded.len()); + assert_eq!(decoded_entries_size, entries_size); + decoded_indexes + } + + let entry_indexes = vec![Vec::new(), generate_entry_indexes_opt(7, 17, None)]; + for mut idxs in entry_indexes.into_iter() { + let decoded = encode_and_decode(&mut idxs); + assert_eq!(idxs, decoded.0); + } + + let mut entry_indexes_with_file_id = + generate_entry_indexes_opt(7, 17, Some(FileId::new(LogQueue::Append, 7))); + let mut decoded = encode_and_decode(&mut entry_indexes_with_file_id); + assert_ne!(entry_indexes_with_file_id, decoded.0); + for i in decoded.0.iter_mut() { + i.entries = None; + } + assert_ne!(entry_indexes_with_file_id, decoded.0); + } + + #[test] + fn test_command_enc_dec() { + let cmds = vec![Command::Clean, Command::Compact { index: 7 }]; + let invalid_command_type = 7; + for cmd in cmds.into_iter() { + let mut encoded = vec![]; + cmd.encode(&mut encoded); + let mut bytes_slice = encoded.as_slice(); + let decoded_cmd = Command::decode(&mut bytes_slice).unwrap(); + assert_eq!(bytes_slice.len(), 0); + assert!(decoded_cmd.approximate_size() >= encoded.len()); + assert_eq!(cmd, decoded_cmd); + + encoded[0] = invalid_command_type; + let expected = 
format!("Unrecognized command type: {invalid_command_type}"); + assert!(matches!( + Command::decode(&mut encoded.as_slice()), + Err(Error::Corruption(m)) if m == expected + )); + } + } + + #[test] + fn test_kv_enc_dec() { + let kvs = vec![ + KeyValue::new(OpType::Put, b"put".to_vec(), Some(b"put_v".to_vec())), + KeyValue::new(OpType::Del, b"del".to_vec(), None), + ]; + let invalid_op_type = 7; + for kv in kvs.into_iter() { + let mut encoded = vec![]; + kv.encode(&mut encoded).unwrap(); + let mut bytes_slice = encoded.as_slice(); + let decoded_kv = KeyValue::decode(&mut bytes_slice).unwrap(); + assert_eq!(bytes_slice.len(), 0); + assert!(decoded_kv.approximate_size() >= encoded.len()); + assert_eq!(kv, decoded_kv); + + encoded[0] = invalid_op_type; + let expected = format!("Unrecognized op type: {invalid_op_type}"); + assert!(matches!( + KeyValue::decode(&mut encoded.as_slice()), + Err(Error::Corruption(m)) if m == expected + )); + } + + let del_with_value = KeyValue::new(OpType::Del, b"del".to_vec(), Some(b"del_v".to_vec())); + let mut encoded = vec![]; + del_with_value.encode(&mut encoded).unwrap(); + let mut bytes_slice = encoded.as_slice(); + let decoded_kv = KeyValue::decode(&mut bytes_slice).unwrap(); + assert_eq!(bytes_slice.len(), 0); + assert!(decoded_kv.value.is_none()); + } + + #[test] + fn test_log_item_enc_dec() { + let items = vec![ + LogItem::new_entry_indexes(7, generate_entry_indexes_opt(7, 17, None)), + LogItem::new_command(17, Command::Compact { index: 7 }), + LogItem::new_kv(27, OpType::Put, b"key".to_vec(), Some(b"value".to_vec())), + ]; + let invalid_log_item_type = 7; + for mut item in items.into_iter() { + let mut entries_size = 0; + if let LogItemContent::EntryIndexes(EntryIndexes(indexes)) = &mut item.content { + for index in indexes.iter_mut() { + index.entry_offset = entries_size; + entries_size += index.entry_len; + } + } + let mut encoded = vec![]; + item.encode(&mut encoded).unwrap(); + let mut bytes_slice = encoded.as_slice(); + let mut decoded_entries_size = 0; + let decoded_item = + LogItem::decode(&mut bytes_slice, &mut decoded_entries_size).unwrap(); + assert_eq!(bytes_slice.len(), 0); + assert_eq!(decoded_entries_size, entries_size); + assert!(decoded_item.approximate_size() >= encoded.len()); + assert_eq!(item, decoded_item); + + // consume raft group id. 
+ bytes_slice = encoded.as_slice(); + codec::decode_var_u64(&mut bytes_slice).unwrap(); + let next_u8 = encoded.len() - bytes_slice.len(); + encoded[next_u8] = invalid_log_item_type; + let expected = format!("Unrecognized log item type: {invalid_log_item_type}"); + assert!(matches!( + LogItem::decode(&mut encoded.as_slice(), &mut decoded_entries_size), + Err(Error::Corruption(m)) if m == expected + )); + } + } + + #[test] + fn test_log_item_batch_enc_dec() { + let mut batches = vec![LogItemBatch::default()]; + let mut batch = LogItemBatch::default(); + batch.add_entry_indexes(7, generate_entry_indexes_opt(1, 5, None /* file_id */)); + batch.add_entry_indexes( + 7 + 100, + generate_entry_indexes_opt(100, 105, None /* file_id */), + ); + batch.add_command(7, Command::Clean); + batch.put(7, b"key".to_vec(), b"value".to_vec()); + batch.delete(7, b"key2".to_vec()); + batches.push(batch); + + for batch in batches.into_iter() { + for compression_type in [CompressionType::Lz4, CompressionType::None] { + let mut batch = batch.clone(); + batch.finish_populate(compression_type); + let mut encoded_batch = vec![]; + batch.encode(&mut encoded_batch).unwrap(); + let file_context = + LogFileContext::new(FileId::dummy(LogQueue::Append), Version::default()); + batch + .prepare_write(&mut encoded_batch, &file_context) + .unwrap(); + batch.finish_write(FileBlockHandle::dummy(LogQueue::Append)); + let decoded_batch = LogItemBatch::decode( + &mut encoded_batch.as_slice(), + FileBlockHandle::dummy(LogQueue::Append), + compression_type, + &file_context, + ) + .unwrap(); + assert!(decoded_batch.approximate_size() >= encoded_batch.len()); + assert_eq!(batch, decoded_batch); + } + } + } + + #[test] + fn test_log_batch_enc_dec() { + fn decode_and_encode( + mut batch: LogBatch, + compress: bool, + version: Version, + entry_data: &[u8], + ) { + // Test call protocol violation. + assert!(catch_unwind_silent(|| batch.encoded_bytes()).is_err()); + assert!(catch_unwind_silent( + || batch.finish_write(FileBlockHandle::dummy(LogQueue::Append)) + ) + .is_err()); + let mocked_file_block_handle = FileBlockHandle { + id: FileId::new(LogQueue::Append, 12), + len: 0, + offset: 0, + }; + let old_approximate_size = batch.approximate_size(); + let (len, _) = batch.finish_populate(usize::from(compress), None).unwrap(); + assert!(old_approximate_size >= len); + assert_eq!(batch.approximate_size(), len); + let mut batch_handle = mocked_file_block_handle; + batch_handle.len = len; + let file_context = LogFileContext::new(batch_handle.id, version); + batch.prepare_write(&file_context).unwrap(); + batch.finish_write(batch_handle); + let encoded = batch.encoded_bytes(); + assert_eq!(encoded.len(), len); + if len < LOG_BATCH_HEADER_LEN { + assert_eq!(len, 0); + let expected = "Log batch header too short: 0"; + assert!(matches!( + LogBatch::decode_header(&mut &*encoded), + Err(Error::Corruption(m)) if m == expected + )); + return; + } + + let item_batch = batch.item_batch.clone(); + // decode item batch + let mut bytes_slice = encoded; + let (offset, compression_type, len) = + LogBatch::decode_header(&mut bytes_slice).unwrap(); + assert_eq!(len, encoded.len()); + assert_eq!(bytes_slice.len() + LOG_BATCH_HEADER_LEN, encoded.len()); + let mut entries_handle = mocked_file_block_handle; + entries_handle.offset = LOG_BATCH_HEADER_LEN as u64; + entries_handle.len = offset - LOG_BATCH_HEADER_LEN; + let file_context = LogFileContext::new(entries_handle.id, version); + { + // Decoding with wrong compression type is okay. 
+ LogItemBatch::decode( + &mut &encoded[offset..], + entries_handle, + if compression_type == CompressionType::None { + CompressionType::Lz4 + } else { + CompressionType::None + }, + &file_context, + ) + .unwrap(); + // Decode with wrong file number. + if version.has_log_signing() { + LogItemBatch::decode( + &mut &encoded[offset..], + entries_handle, + compression_type, + &LogFileContext::new(FileId::new(LogQueue::Append, u64::MAX), version), + ) + .unwrap_err(); + } + // Decode with wrong version. + LogItemBatch::decode( + &mut &encoded[offset..], + entries_handle, + compression_type, + &LogFileContext::new( + file_context.id, + if version == Version::V1 { + Version::V2 + } else { + Version::V1 + }, + ), + ) + .unwrap_err(); + } + let decoded_item_batch = LogItemBatch::decode( + &mut &encoded[offset..], + entries_handle, + compression_type, + &file_context, + ) + .unwrap(); + assert_eq!(decoded_item_batch, item_batch); + assert!(decoded_item_batch.approximate_size() >= len - offset); + + let entries = &encoded[LOG_BATCH_HEADER_LEN..offset]; + for item in decoded_item_batch.items.iter() { + if let LogItemContent::EntryIndexes(entry_indexes) = &item.content { + if !entry_indexes.0.is_empty() { + let (begin, end) = ( + entry_indexes.0.first().unwrap().index, + entry_indexes.0.last().unwrap().index + 1, + ); + let origin_entries = generate_entries(begin, end, Some(entry_data)); + let decoded_entries = + decode_entries_from_bytes::(entries, &entry_indexes.0, false); + assert_eq!(origin_entries, decoded_entries); + } + } + } + } + + let mut batches = vec![(LogBatch::default(), Vec::new())]; + let mut batch = LogBatch::default(); + let entry_data = vec![b'x'; 1024]; + batch + .add_entries::(7, &generate_entries(1, 11, Some(&entry_data))) + .unwrap(); + batch.add_command(7, Command::Clean); + batch.put(7, b"key".to_vec(), b"value".to_vec()).unwrap(); + batch.delete(7, b"key2".to_vec()); + batch + .add_entries::(7, &generate_entries(1, 11, Some(&entry_data))) + .unwrap(); + batches.push((batch, entry_data)); + let mut batch = LogBatch::default(); + batch + .add_entries::(17, &generate_entries(0, 1, None)) + .unwrap(); + batch + .add_entries::(27, &generate_entries(1, 11, None)) + .unwrap(); + batches.push((batch, Vec::new())); + + // Validate with different Versions + for version in Version::iter() { + for compress in [true, false] { + for (batch, entry_data) in batches.clone().into_iter() { + decode_and_encode(batch, compress, version, &entry_data); + } + } + } + } + + #[test] + fn test_log_batch_merge() { + let region_id = 8; + let mut entries = Vec::new(); + let mut kvs = Vec::new(); + let data = vec![b'x'; 1024]; + let file_id = FileId::dummy(LogQueue::Append); + let file_context = LogFileContext::new(file_id, Version::default()); + + let mut batch1 = LogBatch::default(); + entries.push(generate_entries(1, 11, Some(&data))); + batch1 + .add_entries::(region_id, entries.last().unwrap()) + .unwrap(); + for i in 0..=2 { + let (k, v) = (format!("k{i}").into_bytes(), format!("v{i}").into_bytes()); + batch1.put(region_id, k.clone(), v.clone()).unwrap(); + kvs.push((k, v)); + } + + batch1.merge(&mut LogBatch::default()).unwrap(); + + let mut batch2 = LogBatch::default(); + entries.push(generate_entries(11, 21, Some(&data))); + batch2 + .add_entries::(region_id, entries.last().unwrap()) + .unwrap(); + for i in 3..=5 { + let (k, v) = (format!("k{i}").into_bytes(), format!("v{i}").into_bytes()); + batch2.put(region_id, k.clone(), v.clone()).unwrap(); + kvs.push((k, v)); + } + + batch1.merge(&mut 
batch2).unwrap(); + assert!(batch2.is_empty()); + + let (len, _) = batch1.finish_populate(0, None).unwrap(); + batch1.prepare_write(&file_context).unwrap(); + let encoded = batch1.encoded_bytes(); + assert_eq!(len, encoded.len()); + + // decode item batch + let (offset, compression_type, len) = LogBatch::decode_header(&mut &*encoded).unwrap(); + assert_eq!(encoded.len(), len); + let decoded_item_batch = LogItemBatch::decode( + &mut &encoded[offset..], + FileBlockHandle { + id: file_id, + offset: 0, + len: offset - LOG_BATCH_HEADER_LEN, + }, + compression_type, + &file_context, + ) + .unwrap(); + + // decode and assert entries + let entry_bytes = &encoded[LOG_BATCH_HEADER_LEN..offset]; + for item in decoded_item_batch.items.iter() { + match &item.content { + LogItemContent::EntryIndexes(entry_indexes) => { + let decoded_entries = + decode_entries_from_bytes::(entry_bytes, &entry_indexes.0, false); + assert_eq!(entries.remove(0), decoded_entries); + } + LogItemContent::Kv(kv) => { + let (k, v) = kvs.remove(0); + assert_eq!(OpType::Put, kv.op_type); + assert_eq!(k, kv.key); + assert_eq!(&v, kv.value.as_ref().unwrap()); + } + _ => unreachable!(), + } + } + } + + #[test] + fn test_empty_log_batch() { + let mut batch = LogBatch::default(); + assert!(batch.is_empty()); + batch.add_entries::(0, &Vec::new()).unwrap(); + assert!(batch.is_empty()); + batch.add_raw_entries(0, Vec::new(), Vec::new()).unwrap(); + assert!(batch.is_empty()); + // Encoding empty LogBatch. + { + let mocked_file_block_handles = FileBlockHandle { + id: FileId::new(LogQueue::Append, 12), + len: 0, + offset: 0, + }; + let buf_len = batch.buf.len(); + let (len, compression_ratio) = batch.finish_populate(1, None).unwrap(); + assert!(compression_ratio == 0.0); + assert!(len == 0); + assert_eq!(batch.buf_state, BufState::Encoded(buf_len, 0)); + let file_context = LogFileContext::new(mocked_file_block_handles.id, Version::V2); + batch.prepare_write(&file_context).unwrap(); + assert!(batch.is_empty()); + assert_eq!(batch.buf_state, BufState::Sealed(buf_len, 0)); + } + } + + #[test] + fn test_internal_key() { + let mut batch = LogBatch::default(); + assert!(matches!( + batch + .put(0, crate::make_internal_key(&[0]), b"v".to_vec()) + .unwrap_err(), + Error::InvalidArgument(_) + )); + assert!(matches!( + batch + .put_message( + 0, + crate::make_internal_key(ATOMIC_GROUP_KEY), + &Entry::default() + ) + .unwrap_err(), + Error::InvalidArgument(_) + )); + } + + #[test] + fn test_header_corruption() { + let region_id = 7; + let data = vec![b'x'; 16]; + let mut batch = LogBatch::default(); + batch + .add_entries::(region_id, &generate_entries(1, 11, Some(&data))) + .unwrap(); + batch + .put(region_id, b"key".to_vec(), b"value".to_vec()) + .unwrap(); + // enable compression so that len_and_type > len. 
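Editor's note: the comment above holds because of how the header is packed. `decode_header` earlier in this file reads two fixed-width u64s: the first stores the total batch length in its upper 56 bits and the compression type in its low 8 bits, the second stores the offset of the item section. A standalone sketch of that packing follows; the helper names and the big-endian byte order are assumptions of the sketch, not crate APIs.

```rust
// Editor's sketch of the 16-byte header layout parsed by `decode_header`.
fn pack_header(total_len: u64, compression_type: u8, item_offset: u64) -> [u8; 16] {
    let len_and_type = (total_len << 8) | compression_type as u64;
    let mut out = [0u8; 16];
    out[..8].copy_from_slice(&len_and_type.to_be_bytes());
    out[8..].copy_from_slice(&item_offset.to_be_bytes());
    out
}

fn unpack_header(buf: &[u8; 16]) -> (u64, u8, u64) {
    let len_and_type = u64::from_be_bytes(buf[..8].try_into().unwrap());
    let item_offset = u64::from_be_bytes(buf[8..].try_into().unwrap());
    (len_and_type >> 8, len_and_type as u8, item_offset)
}

fn main() {
    // With compression enabled the low byte is non-zero, so the packed
    // `len_and_type` is strictly greater than the length itself.
    let h = pack_header(1024, 1, 160);
    assert_eq!(unpack_header(&h), (1024, 1, 160));
    assert!(u64::from_be_bytes(h[..8].try_into().unwrap()) > 1024);
}
```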
+ batch.finish_populate(1, None).unwrap(); + let file_context = LogFileContext::new(FileId::dummy(LogQueue::Append), Version::default()); + batch.prepare_write(&file_context).unwrap(); + let encoded = batch.encoded_bytes(); + + let mut copy = encoded.to_owned(); + copy.truncate(LOG_BATCH_HEADER_LEN - 1); + assert!(LogBatch::decode_header(&mut copy.as_slice()) + .unwrap_err() + .to_string() + .contains("Log batch header too short")); + + let mut copy = encoded.to_owned(); + (&mut copy[LOG_BATCH_HEADER_LEN - 8..LOG_BATCH_HEADER_LEN]) + .write_u64::(encoded.len() as u64 + 1) + .unwrap(); + assert!(LogBatch::decode_header(&mut copy.as_slice()) + .unwrap_err() + .to_string() + .contains("Log item offset exceeds log batch length")); + + let mut copy = encoded.to_owned(); + (&mut copy[LOG_BATCH_HEADER_LEN - 8..LOG_BATCH_HEADER_LEN]) + .write_u64::(LOG_BATCH_HEADER_LEN as u64 - 1) + .unwrap(); + assert!(LogBatch::decode_header(&mut copy.as_slice()) + .unwrap_err() + .to_string() + .contains("Log item offset is smaller than log batch header length")); + } + + #[cfg(feature = "nightly")] + #[bench] + fn bench_log_batch_add_entry_and_encode(b: &mut test::Bencher) { + use rand::{thread_rng, Rng}; + fn details(log_batch: &mut LogBatch, entries: &[Entry], regions: usize) { + for _ in 0..regions { + log_batch + .add_entries::(thread_rng().gen(), entries) + .unwrap(); + } + log_batch.finish_populate(0, None).unwrap(); + let _ = log_batch.drain(); + } + let data: Vec = (0..128).map(|_| thread_rng().gen()).collect(); + let entries = generate_entries(1, 11, Some(&data)); + let mut log_batch = LogBatch::default(); + // warm up + details(&mut log_batch, &entries, 100); + b.iter(move || { + details(&mut log_batch, &entries, 100); + }); + } + + #[test] + fn test_log_batch_sign_signature_repeatedly() { + // Set a LogBatch and encode the LogBatch by `finish_populate`. + let mut batch = LogBatch::default(); + batch + .add_entries::(17, &generate_entries(0, 1, None)) + .unwrap(); + batch + .add_entries::(27, &generate_entries(1, 11, None)) + .unwrap(); + + let mocked_file_block_handles = [ + FileBlockHandle { + id: FileId::new(LogQueue::Append, 12), + len: 0, + offset: 0, + }, + FileBlockHandle { + id: FileId::new(LogQueue::Append, 18), + len: 0, + offset: 0, + }, + FileBlockHandle { + id: FileId::new(LogQueue::Append, 2001), + len: 0, + offset: 0, + }, + ]; + let old_approximate_size = batch.approximate_size(); + let (len, compression_ratio) = batch.finish_populate(1, None).unwrap(); + assert!(compression_ratio > 0.0); + assert!(old_approximate_size >= len); + assert_eq!(batch.approximate_size(), len); + let checksum = batch.item_batch.checksum; + + // Repeatedly sign signature to this batch, followed by decoding the signature + // and verifying the checksum. 
+ for handle in mocked_file_block_handles { + let mut batch_handle = handle; + batch_handle.len = len; + let file_context = LogFileContext::new(batch_handle.id, Version::V2); + batch.prepare_write(&file_context).unwrap(); + assert_eq!(batch.approximate_size(), len); + let encoded = batch.encoded_bytes(); + assert_eq!(encoded.len(), len); + let mut bytes_slice = encoded; + let (offset, _, _) = LogBatch::decode_header(&mut bytes_slice).unwrap(); + let expected = + verify_checksum_with_signature(&encoded[offset..], file_context.get_signature()) + .unwrap(); + assert_eq!(expected, checksum); + } + } +} diff --git a/third/raft-engine/src/memtable.rs b/third/raft-engine/src/memtable.rs new file mode 100644 index 00000000..7a0ea41b --- /dev/null +++ b/third/raft-engine/src/memtable.rs @@ -0,0 +1,2538 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use std::borrow::BorrowMut; +use std::collections::{BTreeMap, HashSet, VecDeque}; +use std::marker::PhantomData; +use std::ops::Bound; +use std::sync::Arc; + +use fail::fail_point; +use hashbrown::HashMap; +use log::{error, warn}; +use parking_lot::{Mutex, RwLock}; + +use crate::config::Config; +use crate::file_pipe_log::ReplayMachine; +use crate::log_batch::{ + AtomicGroupStatus, Command, CompressionType, KeyValue, LogBatch, LogItem, LogItemBatch, + LogItemContent, OpType, +}; +use crate::metrics::MEMORY_USAGE; +use crate::pipe_log::{FileBlockHandle, FileId, FileSeq, LogQueue}; +use crate::util::{hash_u64, Factory}; +use crate::{Error, GlobalStats, Result}; + +#[cfg(feature = "swap")] +mod swap_conditional_imports { + use crate::swappy_allocator::SwappyAllocator; + use std::convert::TryFrom; + use std::path::Path; + + pub trait AllocatorTrait: std::alloc::Allocator + Clone + Send + Sync {} + impl AllocatorTrait for T {} + + pub type VacantAllocator = std::alloc::Global; + pub type SelectedAllocator = SwappyAllocator; + + pub fn new_vacant_allocator() -> VacantAllocator { + std::alloc::Global + } + pub fn new_allocator(cfg: &crate::Config) -> SelectedAllocator { + let memory_limit = + usize::try_from(cfg.memory_limit.map_or(u64::MAX, |l| l.0)).unwrap_or(usize::MAX); + let path = Path::new(&cfg.dir).join("swap"); + SwappyAllocator::new(&path, memory_limit) + } +} + +#[cfg(not(feature = "swap"))] +mod swap_conditional_imports { + pub trait AllocatorTrait: Clone + Send + Sync {} + + #[derive(Clone)] + pub struct DummyAllocator; + impl AllocatorTrait for DummyAllocator {} + + pub type VacantAllocator = DummyAllocator; + pub type SelectedAllocator = DummyAllocator; + + pub fn new_vacant_allocator() -> VacantAllocator { + DummyAllocator + } + pub fn new_allocator(_: &crate::Config) -> SelectedAllocator { + DummyAllocator + } +} + +use swap_conditional_imports::*; + +/// Attempt to shrink entry container if its capacity reaches the threshold. +const CAPACITY_SHRINK_THRESHOLD: usize = 1024 - 1; +const CAPACITY_INIT: usize = 32 - 1; +/// Number of hash table to store [`MemTable`]. +const MEMTABLE_SLOT_COUNT: usize = 128; + +/// Location of a log entry. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct EntryIndex { + /// Logical index. + pub index: u64, + + /// File location of the group of entries that this entry belongs to. + pub entries: Option, + // How its group of entries is compacted. + pub compression_type: CompressionType, + + /// The relative offset within its group of entries. + pub entry_offset: u32, + /// The encoded length within its group of entries. 
+ pub entry_len: u32, +} + +impl Default for EntryIndex { + fn default() -> EntryIndex { + EntryIndex { + index: 0, + entries: None, + compression_type: CompressionType::None, + entry_offset: 0, + entry_len: 0, + } + } +} + +impl EntryIndex { + fn from_thin(index: u64, e: ThinEntryIndex) -> Self { + Self { + index, + entries: e.entries, + compression_type: e.compression_type, + entry_offset: e.entry_offset, + entry_len: e.entry_len, + } + } +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +struct ThinEntryIndex { + entries: Option, + compression_type: CompressionType, + entry_offset: u32, + entry_len: u32, +} + +impl From<&EntryIndex> for ThinEntryIndex { + fn from(e: &EntryIndex) -> Self { + Self { + entries: e.entries, + compression_type: e.compression_type, + entry_offset: e.entry_offset, + entry_len: e.entry_len, + } + } +} + +/// In-memory storage for Raft Groups. +/// +/// Each Raft Group has its own `MemTable` to store all key value pairs and the +/// file locations of all log entries. +pub struct MemTable { + /// The ID of current Raft Group. + region_id: u64, + + /// Container of entries. Incoming entries are pushed to the back with + /// ascending log indexes. + #[cfg(feature = "swap")] + entry_indexes: VecDeque, + #[cfg(not(feature = "swap"))] + entry_indexes: VecDeque, + /// The log index of the first entry. + first_index: u64, + /// The amount of rewritten entries. Rewritten entries are the oldest + /// entries and stored at the front of the container. + rewrite_count: usize, + + /// A map of active key value pairs. + kvs: BTreeMap, (Vec, FileId)>, + + /// (start_seq, end_seq). + /// If there's an active entry stored before end_seq, it possibly belongs to + /// an atomic group. In order to not lose this entry, We cannot delete any + /// other entries in that group. + /// Only applies to Rewrite queue. Each Raft Group has at most one atomic + /// group at a time, because we only use atomic group for rewrite-rewrite + /// operation, a group always contains all the Rewrite entries in a Raft + /// Group. + atomic_group: Option<(FileSeq, FileSeq)>, + + /// Shared statistics. + global_stats: Arc, + + _phantom: PhantomData, +} + +impl MemTable { + #[allow(dead_code)] + fn new(region_id: u64, global_stats: Arc) -> MemTable { + Self::with_allocator(region_id, global_stats, &new_vacant_allocator()) + } +} + +impl MemTable { + fn with_allocator( + region_id: u64, + global_stats: Arc, + _allocator: &A, + ) -> MemTable { + MemTable { + region_id, + #[cfg(feature = "swap")] + entry_indexes: VecDeque::with_capacity_in(CAPACITY_INIT, _allocator.clone()), + #[cfg(not(feature = "swap"))] + entry_indexes: VecDeque::with_capacity(CAPACITY_INIT), + first_index: 0, + rewrite_count: 0, + kvs: BTreeMap::default(), + atomic_group: None, + global_stats, + _phantom: PhantomData, + } + } + + /// Merges with a newer neighbor [`MemTable`]. + /// + /// This method is only used for recovery. + pub fn merge_newer_neighbor(&mut self, rhs: &mut Self) { + debug_assert_eq!(self.region_id, rhs.region_id); + if let Some((rhs_first, _)) = rhs.span() { + self.prepare_append( + rhs_first, + // Rewrite -> Compact Append -> Rewrite. + // TODO: add test case. + rhs.rewrite_count > 0, /* allow_hole */ + // Always true, because `self` might not have all entries in + // history. 
+ true, /* allow_overwrite */ + ); + self.global_stats.add( + rhs.entry_indexes[0].entries.unwrap().id.queue, + rhs.entry_indexes.len(), + ); + self.rewrite_count += rhs.rewrite_count; + self.entry_indexes.append(&mut rhs.entry_indexes); + rhs.rewrite_count = 0; + } + + for (key, (value, file_id)) in rhs.kvs.iter() { + self.put(key.clone(), value.clone(), *file_id); + } + + if let Some(g) = rhs.atomic_group.take() { + assert!(self.atomic_group.map_or(true, |(_, end)| end <= g.0)); + self.atomic_group = Some(g); + } + + let deleted = rhs.global_stats.deleted_rewrite_entries(); + self.global_stats.add(LogQueue::Rewrite, deleted); + self.global_stats.delete(LogQueue::Rewrite, deleted); + } + + /// Merges with a [`MemTable`] that contains only append data. Assumes + /// `self` contains all rewritten data of the same region. + /// + /// This method is only used for recovery. + pub fn merge_append_table(&mut self, rhs: &mut Self) { + debug_assert_eq!(self.region_id, rhs.region_id); + debug_assert_eq!(self.rewrite_count, self.entry_indexes.len()); + debug_assert_eq!(rhs.rewrite_count, 0); + + if let Some((first, _)) = rhs.span() { + self.prepare_append( + first, + // FIXME: It's possibly okay to set it to false. Any compact + // command applied to append queue will also be applied to + // rewrite queue. + true, /* allow_hole */ + // Compact -> Rewrite -> Data loss of the compact command. + true, /* allow_overwrite */ + ); + self.global_stats.add( + rhs.entry_indexes[0].entries.unwrap().id.queue, + rhs.entry_indexes.len(), + ); + self.entry_indexes.append(&mut rhs.entry_indexes); + } + + for (key, (value, file_id)) in rhs.kvs.iter() { + self.put(key.clone(), value.clone(), *file_id); + } + + assert!(rhs.atomic_group.is_none()); + + let deleted = rhs.global_stats.deleted_rewrite_entries(); + self.global_stats.add(LogQueue::Rewrite, deleted); + self.global_stats.delete(LogQueue::Rewrite, deleted); + } + + /// Returns value for a given key. + pub fn get(&self, key: &[u8]) -> Option> { + self.kvs.get(key).map(|v| v.0.clone()) + } + + /// Iterates over [start_key, end_key) range and yields all key value pairs + /// as bytes. + pub fn scan( + &self, + start_key: Option<&[u8]>, + end_key: Option<&[u8]>, + reverse: bool, + mut f: F, + ) -> Result<()> + where + F: FnMut(&[u8], &[u8]) -> bool, + { + let lower = start_key.map(Bound::Included).unwrap_or(Bound::Unbounded); + let upper = end_key.map(Bound::Excluded).unwrap_or(Bound::Unbounded); + let iter = self.kvs.range::<[u8], _>((lower, upper)); + if reverse { + for (key, (value, _)) in iter.rev() { + if !f(key, value) { + break; + } + } + } else { + for (key, (value, _)) in iter { + if !f(key, value) { + break; + } + } + } + Ok(()) + } + + /// Deletes a key value pair. + pub fn delete(&mut self, key: &[u8]) { + if let Some(value) = self.kvs.remove(key) { + self.global_stats.delete(value.1.queue, 1); + } + } + + /// Puts a key value pair that has been written to the specified file. The + /// old value for this key will be deleted if exists. + pub fn put(&mut self, key: Vec, value: Vec, file_id: FileId) { + if let Some(origin) = self.kvs.insert(key, (value, file_id)) { + self.global_stats.delete(origin.1.queue, 1); + } + self.global_stats.add(file_id.queue, 1); + } + + /// Rewrites a key by marking its location to the `seq`-th log file in + /// rewrite queue. No-op if the key does not exist. + /// + /// When `gate` is present, only append data no newer than it will be + /// rewritten. 
+ pub fn rewrite_key(&mut self, key: Vec, gate: Option, seq: FileSeq) { + self.global_stats.add(LogQueue::Rewrite, 1); + if let Some(origin) = self.kvs.get_mut(&key) { + if origin.1.queue == LogQueue::Append { + if let Some(gate) = gate { + if origin.1.seq <= gate { + origin.1 = FileId { + queue: LogQueue::Rewrite, + seq, + }; + self.global_stats.delete(LogQueue::Append, 1); + return; + } + } + } else { + assert!(origin.1.seq <= seq); + origin.1.seq = seq; + } + } + self.global_stats.delete(LogQueue::Rewrite, 1); + } + + /// Returns the log entry location for a given logical log index. + pub fn get_entry(&self, index: u64) -> Option { + if let Some((first, last)) = self.span() { + if index < first || index > last { + return None; + } + + let ioffset = (index - first) as usize; + let entry_index = self.entry_indexes[ioffset]; + Some(EntryIndex::from_thin(index, entry_index)) + } else { + None + } + } + + /// Appends some log entries from append queue. Existing entries newer than + /// any of the incoming entries will be deleted silently. Assumes the + /// provided entries have consecutive logical indexes. + /// + /// # Panics + /// + /// Panics if index of the first entry in `entry_indexes` is greater than + /// largest existing index + 1 (hole). + /// + /// Panics if incoming entries contains indexes that might be compacted + /// before (overwrite history). + pub fn append(&mut self, entry_indexes: Vec) { + let len = entry_indexes.len(); + if len > 0 { + self.prepare_append( + entry_indexes[0].index, + false, /* allow_hole */ + false, /* allow_overwrite */ + ); + self.global_stats.add(LogQueue::Append, len); + for ei in &entry_indexes { + self.entry_indexes.push_back(ei.into()); + } + } + } + + /// Appends some entries from append queue. Assumes this table has no + /// rewrite data. + /// + /// This method is only used for recovery. + pub fn replay_append(&mut self, entry_indexes: Vec) { + let len = entry_indexes.len(); + if len > 0 { + debug_assert_eq!(self.rewrite_count, 0); + self.prepare_append( + entry_indexes[0].index, + false, /* allow_hole */ + // Refer to case in `merge_newer_neighbor`. + true, /* allow_overwrite */ + ); + self.global_stats.add(LogQueue::Append, len); + for ei in &entry_indexes { + debug_assert_eq!(ei.entries.unwrap().id.queue, LogQueue::Append); + self.entry_indexes.push_back(ei.into()); + } + } + } + + /// Rewrites some entries by modifying their location. + /// + /// When `gate` is present, only append data no newer than it will be + /// rewritten. + /// + /// # Panics + /// + /// Panics if index of the first entry in `rewrite_indexes` is greater than + /// largest existing rewritten index + 1 (hole). 
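Editor's note: the gating rule described above, implemented by `rewrite_key` (above) and `rewrite` (below), can be summarized as a simplified standalone predicate. This is an editor's sketch with stand-in types; the real code additionally advances the file sequence of keys already in the rewrite queue.

```rust
// Editor's sketch of the rewrite gate; simplified relative to `rewrite`/`rewrite_key`.
#[derive(PartialEq)]
enum Queue {
    Append,
    Rewrite,
}

fn should_rewrite(queue: Queue, file_seq: u64, gate: Option<u64>) -> bool {
    match gate {
        // Rewriting append data: only data no newer than the gate moves.
        Some(gate) => queue == Queue::Append && file_seq <= gate,
        // Squeezing the rewrite queue: never touch fresh append data.
        None => queue == Queue::Rewrite,
    }
}

fn main() {
    assert!(should_rewrite(Queue::Append, 5, Some(7)));
    assert!(!should_rewrite(Queue::Append, 9, Some(7)));
    assert!(should_rewrite(Queue::Rewrite, 3, None));
    assert!(!should_rewrite(Queue::Append, 3, None));
}
```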
+ pub fn rewrite(&mut self, rewrite_indexes: Vec, gate: Option) { + if rewrite_indexes.is_empty() { + return; + } + self.global_stats + .add(LogQueue::Rewrite, rewrite_indexes.len()); + + let len = self.entry_indexes.len(); + if len == 0 { + self.global_stats + .delete(LogQueue::Rewrite, rewrite_indexes.len()); + return; + } + + let first = self.first_index; + let last = self.first_index + len as u64 - 1; + let rewrite_first = std::cmp::max(rewrite_indexes[0].index, first); + let rewrite_last = std::cmp::min(rewrite_indexes[rewrite_indexes.len() - 1].index, last); + let mut rewrite_len = (rewrite_last + 1).saturating_sub(rewrite_first) as usize; + if rewrite_len == 0 { + self.global_stats + .delete(LogQueue::Rewrite, rewrite_indexes.len()); + return; + } + + let pos = (rewrite_first - first) as usize; + // No normal log entry mixed in rewritten entries at the front. + assert!( + pos == 0 || self.entry_indexes[pos - 1].entries.unwrap().id.queue == LogQueue::Rewrite + ); + let rewrite_pos = (rewrite_first - rewrite_indexes[0].index) as usize; + + for (i, rindex) in rewrite_indexes[rewrite_pos..rewrite_pos + rewrite_len] + .iter() + .enumerate() + { + let index = &mut self.entry_indexes[i + pos]; + if let Some(gate) = gate { + debug_assert_eq!(index.entries.unwrap().id.queue, LogQueue::Append); + if index.entries.unwrap().id.seq > gate { + // Some entries are overwritten by new appends. + rewrite_len = i; + break; + } + } else if index.entries.unwrap().id.queue == LogQueue::Append { + // Squeeze operation encounters a new append. + rewrite_len = i; + break; + } + + *index = rindex.into(); + } + + if gate.is_none() { + // We either replaced some old rewrite entries, or some incoming entries are + // discarded. + self.global_stats + .delete(LogQueue::Rewrite, rewrite_indexes.len()); + // rewrite-rewrite could partially renew rewrite entries due to batch splitting. + self.rewrite_count = std::cmp::max(self.rewrite_count, pos + rewrite_len); + } else { + self.global_stats.delete(LogQueue::Append, rewrite_len); + self.global_stats + .delete(LogQueue::Rewrite, rewrite_indexes.len() - rewrite_len); + // rewrite-append always push forward. + assert!(pos + rewrite_len >= self.rewrite_count); + self.rewrite_count = pos + rewrite_len; + } + } + + /// Appends some entries from rewrite queue. Assumes this table has no + /// append data. + /// + /// This method is only used for recovery. + pub fn replay_rewrite(&mut self, entry_indexes: Vec) { + let len = entry_indexes.len(); + if len > 0 { + debug_assert_eq!(self.rewrite_count, self.entry_indexes.len()); + self.prepare_append( + entry_indexes[0].index, + // Rewrite -> Compact Append -> Rewrite. + true, /* allow_hole */ + // Refer to case in `merge_append_table`. They can be adapted + // to attack this path via a global rewrite without deleting + // obsolete rewrite files. + true, /* allow_overwrite */ + ); + self.global_stats.add(LogQueue::Rewrite, len); + for ei in &entry_indexes { + self.entry_indexes.push_back(ei.into()); + } + self.rewrite_count = self.entry_indexes.len(); + } + } + + /// Removes all entries with index smaller than `index`. Returns the number + /// of deleted entries. 
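Editor's note: `rewrite` and `replay_rewrite` above both maintain the container invariant that rewritten entries sit at the front of the deque with `rewrite_count` marking the boundary (the same invariant asserted by `consistency_check` in the test helpers further down). A toy standalone check of that invariant, with plain queue labels standing in for `ThinEntryIndex`:

```rust
// Editor's sketch of the "rewrites at the front" invariant.
use std::collections::VecDeque;

#[derive(PartialEq, Clone, Copy)]
enum Queue {
    Append,
    Rewrite,
}

fn check_invariant(entries: &VecDeque<Queue>, rewrite_count: usize) -> bool {
    entries.iter().take(rewrite_count).all(|q| *q == Queue::Rewrite)
        && entries.iter().skip(rewrite_count).all(|q| *q == Queue::Append)
}

fn main() {
    let entries = VecDeque::from(vec![
        Queue::Rewrite,
        Queue::Rewrite,
        Queue::Append,
        Queue::Append,
    ]);
    assert!(check_invariant(&entries, 2));
    assert!(!check_invariant(&entries, 1)); // boundary must not under-count
}
```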
+ pub fn compact_to(&mut self, index: u64) -> u64 { + if self.entry_indexes.is_empty() { + return 0; + } + let first = self.first_index; + if index <= first { + return 0; + } + let count = std::cmp::min((index - first) as usize, self.entry_indexes.len()); + self.first_index = index; + self.entry_indexes.drain(..count); + self.maybe_shrink_entry_indexes(); + + let compacted_rewrite = std::cmp::min(count, self.rewrite_count); + self.rewrite_count -= compacted_rewrite; + self.global_stats + .delete(LogQueue::Rewrite, compacted_rewrite); + self.global_stats + .delete(LogQueue::Append, count - compacted_rewrite); + count as u64 + } + + pub fn apply_rewrite_atomic_group(&mut self, start: FileSeq, end: FileSeq) { + assert!(self.atomic_group.map_or(true, |(_, b)| b <= start)); + self.atomic_group = Some((start, end)); + } + + /// Removes all entry indexes with index greater than or equal to `index`. + /// Assumes `index` <= `last`. + /// + /// Returns the number of deleted entries. + fn unsafe_truncate_back(&mut self, first: u64, index: u64, last: u64) -> usize { + debug_assert!(index <= last); + let len = self.entry_indexes.len(); + debug_assert_eq!(len as u64, last - first + 1); + self.entry_indexes + .truncate(index.saturating_sub(first) as usize); + let new_len = self.entry_indexes.len(); + let truncated = len - new_len; + + if self.rewrite_count > new_len { + let truncated_rewrite = self.rewrite_count - new_len; + self.rewrite_count = new_len; + self.global_stats + .delete(LogQueue::Rewrite, truncated_rewrite); + self.global_stats + .delete(LogQueue::Append, truncated - truncated_rewrite); + } else { + self.global_stats.delete(LogQueue::Append, truncated); + } + truncated + } + + /// Prepares to append entries with indexes starting at + /// `first_index_to_add`. After preparation, those entries can be directly + /// appended to internal container. + /// + /// When `allow_hole` is set, existing entries will be removes if there is a + /// hole detected. Otherwise, panic. + /// + /// When `allow_overwrite_compacted` is set, existing entries will be + /// removes if incoming entries attempt to overwrite compacted slots. + /// Otherwise, panic. + #[inline] + fn prepare_append( + &mut self, + first_index_to_add: u64, + allow_hole: bool, + allow_overwrite_compacted: bool, + ) { + if let Some((first, last)) = self.span() { + if first_index_to_add < first { + if allow_overwrite_compacted { + self.unsafe_truncate_back(first, 0, last); + } else { + panic!( + "attempt to overwrite compacted entries in {}", + self.region_id + ); + } + self.first_index = first_index_to_add; + } else if last + 1 < first_index_to_add { + if allow_hole { + self.unsafe_truncate_back(first, 0, last); + } else { + panic!("memtable {} has a hole", self.region_id); + } + self.first_index = first_index_to_add; + } else if first_index_to_add != last + 1 { + self.unsafe_truncate_back(first, first_index_to_add, last); + } + } else { + self.first_index = first_index_to_add; + } + } + + #[inline] + fn maybe_shrink_entry_indexes(&mut self) { + if self.entry_indexes.capacity() >= CAPACITY_SHRINK_THRESHOLD { + self.entry_indexes.shrink_to_fit(); + } + } + + /// Pulls all entries between log index `begin` and `end` to the given + /// buffer. Returns error if any entry is missing. + /// + /// When `max_size` is present, stops pulling entries when the total size + /// reaches it. 
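Editor's note: the `max_size` rule documented above and implemented in `fetch_entries_to` (which follows) stops pulling once the budget is exceeded but always returns at least one entry. That behaviour in isolation, with plain lengths standing in for entry indexes:

```rust
// Editor's sketch mirroring the size-budget loop in `fetch_entries_to`.
fn take_within_budget(entry_lens: &[u32], max_size: Option<usize>) -> usize {
    let mut total = 0u32;
    let mut taken = 0;
    for len in entry_lens {
        total += len;
        if let Some(max) = max_size {
            // Stop only if the budget is exceeded and this is not the first entry.
            if total as usize > max && total > *len {
                break;
            }
        }
        taken += 1;
    }
    taken
}

fn main() {
    assert_eq!(take_within_budget(&[100, 100, 100], Some(250)), 2);
    // Even a tiny budget still yields the first entry.
    assert_eq!(take_within_budget(&[100, 100, 100], Some(10)), 1);
    assert_eq!(take_within_budget(&[100, 100, 100], None), 3);
}
```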
+ pub fn fetch_entries_to( + &self, + begin: u64, + end: u64, + max_size: Option, + vec_idx: &mut Vec, + ) -> Result<()> { + if end <= begin { + return Ok(()); + } + let len = self.entry_indexes.len(); + if len == 0 { + return Err(Error::EntryNotFound); + } + let first = self.first_index; + if begin < first { + return Err(Error::EntryCompacted); + } + if end > self.first_index + len as u64 { + return Err(Error::EntryNotFound); + } + + let start_pos = (begin - first) as usize; + let end_pos = (end - begin) as usize + start_pos; + + let mut total_size = 0; + let mut index = begin; + for idx in self.entry_indexes.range(start_pos..end_pos) { + total_size += idx.entry_len; + // No matter max_size's value, fetch one entry at least. + if let Some(max_size) = max_size { + if total_size as usize > max_size && total_size > idx.entry_len { + break; + } + } + vec_idx.push(EntryIndex::from_thin(index, *idx)); + index += 1; + } + Ok(()) + } + + /// Pulls all append entries older than or equal to `gate`, to the provided + /// buffer. + pub fn fetch_entry_indexes_before( + &self, + gate: FileSeq, + vec_idx: &mut Vec, + ) -> Result<()> { + if let Some((first, last)) = self.span() { + let mut i = self.rewrite_count; + while first + i as u64 <= last && self.entry_indexes[i].entries.unwrap().id.seq <= gate + { + vec_idx.push(EntryIndex::from_thin( + first + i as u64, + self.entry_indexes[i], + )); + i += 1; + } + } + Ok(()) + } + + /// Pulls all rewrite entries to the provided buffer. + pub fn fetch_rewritten_entry_indexes(&self, vec_idx: &mut Vec) -> Result<()> { + if self.rewrite_count > 0 { + let first = self.first_index; + let end = self.first_index + self.rewrite_count as u64; + self.fetch_entries_to(first, end, None, vec_idx) + } else { + Ok(()) + } + } + + /// Pulls all key value pairs older than or equal to `gate`, to the provided + /// buffer. + pub fn fetch_kvs_before(&self, gate: FileSeq, vec: &mut Vec<(Vec, Vec)>) { + for (key, (value, file_id)) in &self.kvs { + if file_id.queue == LogQueue::Append && file_id.seq <= gate { + vec.push((key.clone(), value.clone())); + } + } + } + + /// Pulls all rewrite key value pairs to the provided buffer. + pub fn fetch_rewritten_kvs(&self, vec: &mut Vec<(Vec, Vec)>) { + for (key, (value, file_id)) in &self.kvs { + if file_id.queue == LogQueue::Rewrite { + vec.push((key.clone(), value.clone())); + } + } + } + + /// Returns the smallest file sequence number of entries or key value pairs + /// in this table. 
+ pub fn min_file_seq(&self, queue: LogQueue) -> Option { + let entry = match queue { + LogQueue::Append => self.entry_indexes.get(self.rewrite_count), + LogQueue::Rewrite if self.rewrite_count == 0 => None, + LogQueue::Rewrite => self.entry_indexes.front(), + }; + let ents_min = entry.map(|e| e.entries.unwrap().id.seq); + let kvs_min = self + .kvs + .values() + .filter(|v| v.1.queue == queue) + .fold(None, |min, v| { + if let Some(min) = min { + Some(std::cmp::min(min, v.1.seq)) + } else { + Some(v.1.seq) + } + }); + let res = match (ents_min, kvs_min) { + (Some(ents_min), Some(kvs_min)) => std::cmp::min(kvs_min, ents_min), + (Some(ents_min), None) => ents_min, + (None, Some(kvs_min)) => kvs_min, + (None, None) => return None, + }; + if queue == LogQueue::Rewrite { + if let Some((start, end)) = self.atomic_group { + if res <= end { + return Some(std::cmp::min(start, res)); + } + } + } + Some(res) + } + + #[inline] + pub fn has_at_least_some_entries_before(&self, gate: FileId, count: usize) -> bool { + debug_assert!(count > 0); + self.entry_indexes + .get(count - 1) + .map_or(false, |ei| ei.entries.unwrap().id.seq <= gate.seq) + } + + /// Returns the region ID. + pub fn region_id(&self) -> u64 { + self.region_id + } + + pub(crate) fn rewrite_count(&self) -> usize { + self.rewrite_count + } + + /// Returns the log index of the first log entry. + pub fn first_index(&self) -> Option { + self.span().map(|s| s.0) + } + + /// Returns the log index of the last log entry. + pub fn last_index(&self) -> Option { + self.span().map(|s| s.1) + } + + #[allow(dead_code)] + fn heap_size(&self) -> usize { + // FIXME: cover the map of kvs. + self.entry_indexes.capacity() * std::mem::size_of::() + } + + /// Returns the first and last log index of the entries in this table. + #[inline] + fn span(&self) -> Option<(u64, u64)> { + let len = self.entry_indexes.len(); + if len > 0 { + Some((self.first_index, self.first_index + len as u64 - 1)) + } else { + None + } + } + + #[cfg(test)] + fn consistency_check(&self) { + let mut seen_append = false; + for idx in self.entry_indexes.iter() { + // rewrites are at the front. + let queue = idx.entries.unwrap().id.queue; + if queue == LogQueue::Append { + seen_append = true; + } + assert_eq!( + queue, + if seen_append { + LogQueue::Append + } else { + LogQueue::Rewrite + } + ); + } + } +} + +impl Drop for MemTable { + fn drop(&mut self) { + let mut append_kvs = 0; + let mut rewrite_kvs = 0; + for (_v, id) in self.kvs.values() { + match id.queue { + LogQueue::Rewrite => rewrite_kvs += 1, + LogQueue::Append => append_kvs += 1, + } + } + + self.global_stats + .delete(LogQueue::Rewrite, self.rewrite_count + rewrite_kvs); + self.global_stats.delete( + LogQueue::Append, + self.entry_indexes.len() - self.rewrite_count + append_kvs, + ); + } +} + +type MemTableMap = HashMap>>>; +pub type MemTableHandle = Arc>>; +pub type MemTables = MemTableAccessor; + +/// A collection of [`MemTable`]s. +/// +/// Internally, they are stored in multiple [`HashMap`]s, which are indexed by +/// hashed region IDs. +#[derive(Clone)] +pub struct MemTableAccessor { + global_stats: Arc, + allocator: A, + + /// A fixed-size array of maps of [`MemTable`]s. + slots: Vec>>>, + /// Deleted [`MemTable`]s that are not yet rewritten. 
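Editor's note: the `slots` field just described is indexed by hashing the region ID into a fixed, power-of-two number of shards (see `slot_index` further down), so a bitmask replaces a modulo. A standalone sketch of the selection; the real code uses the crate's `hash_u64`, and the standard library hasher is used here purely for illustration.

```rust
// Editor's sketch of slot selection for the sharded memtable map.
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

const MEMTABLE_SLOT_COUNT: usize = 128; // must be a power of two for the mask

fn slot_index(region_id: u64) -> usize {
    debug_assert!(MEMTABLE_SLOT_COUNT.is_power_of_two());
    let mut hasher = DefaultHasher::new();
    region_id.hash(&mut hasher);
    hasher.finish() as usize & (MEMTABLE_SLOT_COUNT - 1)
}

fn main() {
    // Deterministic per region, and always within bounds.
    assert_eq!(slot_index(42), slot_index(42));
    assert!(slot_index(7) < MEMTABLE_SLOT_COUNT);
}
```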
+ removed_memtables: Arc>>, +} + +impl MemTableAccessor { + pub fn new(global_stats: Arc) -> MemTableAccessor { + let mut slots = Vec::with_capacity(MEMTABLE_SLOT_COUNT); + for _ in 0..MEMTABLE_SLOT_COUNT { + slots.push(Arc::new(RwLock::new(MemTableMap::default()))); + } + MemTableAccessor { + global_stats, + allocator: new_vacant_allocator(), + slots, + removed_memtables: Default::default(), + } + } +} + +impl MemTableAccessor { + pub fn memory_usage(&self) -> usize { + #[cfg(not(feature = "swap"))] + { + let mut total = 0; + for tables in &self.slots { + tables.read().values().for_each(|t| { + total += t.read().heap_size(); + }); + } + total + } + #[cfg(feature = "swap")] + { + self.allocator.memory_usage() + } + } + + pub(crate) fn flush_metrics(&self) { + MEMORY_USAGE.set(self.memory_usage() as i64); + } +} + +impl MemTableAccessor { + pub fn new_with_allocator(global_stats: Arc, allocator: A) -> MemTableAccessor { + let mut slots = Vec::with_capacity(MEMTABLE_SLOT_COUNT); + for _ in 0..MEMTABLE_SLOT_COUNT { + slots.push(Arc::new(RwLock::new(MemTableMap::default()))); + } + MemTableAccessor { + global_stats, + allocator, + slots, + removed_memtables: Default::default(), + } + } + + pub fn get_or_insert(&self, raft_group_id: u64) -> Arc>> { + let global_stats = self.global_stats.clone(); + let mut memtables = self.slots[Self::slot_index(raft_group_id)].write(); + let memtable = memtables.entry(raft_group_id).or_insert_with(|| { + let memtable = + MemTable::with_allocator(raft_group_id, global_stats.clone(), &self.allocator); + Arc::new(RwLock::new(memtable)) + }); + memtable.clone() + } + + pub fn get(&self, raft_group_id: u64) -> Option>>> { + self.slots[Self::slot_index(raft_group_id)] + .read() + .get(&raft_group_id) + .cloned() + } + + pub fn insert(&self, raft_group_id: u64, memtable: Arc>>) { + self.slots[Self::slot_index(raft_group_id)] + .write() + .insert(raft_group_id, memtable); + } + + pub fn remove(&self, raft_group_id: u64, record_tombstone: bool) { + self.slots[Self::slot_index(raft_group_id)] + .write() + .remove(&raft_group_id); + if record_tombstone { + let mut removed_memtables = self.removed_memtables.lock(); + removed_memtables.push_back(raft_group_id); + } + } + + pub fn fold) -> B>(&self, mut init: B, fold: F) -> B { + for tables in &self.slots { + for memtable in tables.read().values() { + init = fold(init, &*memtable.read()); + } + } + init + } + + pub fn collect) -> bool>( + &self, + mut condition: F, + ) -> Vec>>> { + let mut memtables = Vec::new(); + for tables in &self.slots { + memtables.extend(tables.read().values().filter_map(|t| { + if condition(&*t.read()) { + return Some(t.clone()); + } + None + })); + } + memtables + } + + /// Returns a [`LogBatch`] containing `Command::Clean`s of all deleted + /// [`MemTable`]s. The records for these tables will be cleaned up + /// afterwards. + pub fn take_cleaned_region_logs(&self) -> LogBatch { + let mut log_batch = LogBatch::default(); + let mut removed_memtables = self.removed_memtables.lock(); + for id in removed_memtables.drain(..) { + log_batch.add_command(id, Command::Clean); + } + log_batch + } + + /// Returns a [`HashSet`] containing region IDs of all deleted + /// [`MemTable`]s. + /// + /// This method is only used for recovery. 
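Editor's note: `take_cleaned_region_logs` above follows a drain-under-lock pattern: deleted region IDs queue up under a mutex and are turned into `Command::Clean` records in one shot. This sketch shows the same pattern with standalone stand-ins for the crate types (`Cmd` replaces the Clean commands added to a `LogBatch`).

```rust
// Editor's sketch of the tombstone-draining pattern; stand-in types only.
use std::collections::VecDeque;
use std::sync::Mutex;

#[derive(Debug, PartialEq)]
enum Cmd {
    Clean(u64),
}

struct Accessor {
    removed_memtables: Mutex<VecDeque<u64>>,
}

impl Accessor {
    fn take_cleaned_region_cmds(&self) -> Vec<Cmd> {
        // Drain under the lock so each tombstone is emitted exactly once.
        let mut removed = self.removed_memtables.lock().unwrap();
        removed.drain(..).map(Cmd::Clean).collect()
    }
}

fn main() {
    let acc = Accessor {
        removed_memtables: Mutex::new(VecDeque::from(vec![1, 2])),
    };
    assert_eq!(acc.take_cleaned_region_cmds(), vec![Cmd::Clean(1), Cmd::Clean(2)]);
    assert!(acc.take_cleaned_region_cmds().is_empty());
}
```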
+ #[cfg(test)] + pub fn cleaned_region_ids(&self) -> HashSet { + let mut ids = HashSet::default(); + let removed_memtables = self.removed_memtables.lock(); + for raft_id in removed_memtables.iter() { + ids.insert(*raft_id); + } + ids + } + + /// Returns `true` if it does not contains any memtable. + pub fn is_empty(&self) -> bool { + for i in 0..MEMTABLE_SLOT_COUNT { + if !self.slots[i].read().is_empty() { + return false; + } + } + true + } + + /// Merges with a newer neighbor [`MemTableAccessor`]. + /// + /// This method is only used for recovery. + pub fn merge_newer_neighbor(&self, mut rhs: Self) { + for slot in rhs.slots.iter_mut() { + for (raft_group_id, memtable) in slot.write().drain() { + self.get_or_insert(raft_group_id) + .write() + .merge_newer_neighbor(memtable.write().borrow_mut()); + } + } + // Discarding neighbor's tombstones, they will be applied by + // `MemTableRecoverContext`. + } + + /// Merges with a [`MemTableAccessor`] that contains only append data. + /// Assumes `self` contains all rewritten data. + /// + /// This method is only used for recovery. + pub fn merge_append_table(&self, mut rhs: Self) { + for slot in rhs.slots.iter_mut() { + for (id, memtable) in std::mem::take(&mut *slot.write()) { + if let Some(existing_memtable) = self.get(id) { + existing_memtable + .write() + .merge_append_table(&mut *memtable.write()); + } else { + self.insert(id, memtable); + } + } + } + // Tombstones from both table are identical. + debug_assert_eq!( + self.removed_memtables.lock().len(), + rhs.removed_memtables.lock().len() + ); + } + + /// Applies changes from log items that have been written to append queue. + pub fn apply_append_writes(&self, log_items: impl Iterator) { + for item in log_items { + if has_internal_key(&item) { + continue; + } + let raft = item.raft_group_id; + let memtable = self.get_or_insert(raft); + fail_point!( + "memtable_accessor::apply_append_writes::region_3", + raft == 3, + |_| {} + ); + match item.content { + LogItemContent::EntryIndexes(entries_to_add) => { + memtable.write().append(entries_to_add.0); + } + LogItemContent::Command(Command::Clean) => { + self.remove(raft, true /* record_tombstone */); + } + LogItemContent::Command(Command::Compact { index }) => { + memtable.write().compact_to(index); + } + LogItemContent::Kv(kv) => match kv.op_type { + OpType::Put => { + let value = kv.value.unwrap(); + memtable.write().put(kv.key, value, kv.file_id.unwrap()); + } + OpType::Del => { + let key = kv.key; + memtable.write().delete(key.as_slice()); + } + }, + } + } + } + + /// Applies changes from log items that are replayed from a append queue. + /// Assumes it haven't applied any rewrite data. + /// + /// This method is only used for recovery. 
+ pub fn replay_append_writes(&self, log_items: impl Iterator) { + for item in log_items { + if has_internal_key(&item) { + continue; + } + let raft = item.raft_group_id; + let memtable = self.get_or_insert(raft); + match item.content { + LogItemContent::EntryIndexes(entries_to_add) => { + memtable.write().replay_append(entries_to_add.0); + } + LogItemContent::Command(Command::Clean) => { + self.remove(raft, true /* record_tombstone */); + } + LogItemContent::Command(Command::Compact { index }) => { + memtable.write().compact_to(index); + } + LogItemContent::Kv(kv) => match kv.op_type { + OpType::Put => { + let value = kv.value.unwrap(); + memtable.write().put(kv.key, value, kv.file_id.unwrap()); + } + OpType::Del => { + let key = kv.key; + memtable.write().delete(key.as_slice()); + } + }, + } + } + } + + /// Applies changes from log items that have been written to rewrite queue. + pub fn apply_rewrite_writes( + &self, + log_items: impl Iterator, + watermark: Option, + new_file: FileSeq, + ) { + for item in log_items { + if has_internal_key(&item) { + continue; + } + let raft = item.raft_group_id; + let memtable = self.get_or_insert(raft); + match item.content { + LogItemContent::EntryIndexes(entries_to_add) => { + memtable.write().rewrite(entries_to_add.0, watermark); + } + LogItemContent::Kv(kv) => match kv.op_type { + OpType::Put => { + let key = kv.key; + memtable.write().rewrite_key(key, watermark, new_file); + } + _ => unreachable!(), + }, + LogItemContent::Command(Command::Clean) => {} + _ => unreachable!(), + } + } + } + + /// Applies changes from log items that are replayed from a rewrite queue. + /// Assumes it haven't applied any append data. + /// + /// This method is only used for recovery. + pub fn replay_rewrite_writes(&self, log_items: impl Iterator) { + for item in log_items { + if has_internal_key(&item) { + continue; + } + let raft = item.raft_group_id; + let memtable = self.get_or_insert(raft); + match item.content { + LogItemContent::EntryIndexes(entries_to_add) => { + memtable.write().replay_rewrite(entries_to_add.0); + } + LogItemContent::Command(Command::Clean) => { + // Only append tombstone needs to be recorded. + self.remove(raft, false /* record_tombstone */); + } + LogItemContent::Command(Command::Compact { index }) => { + memtable.write().compact_to(index); + } + LogItemContent::Kv(kv) => match kv.op_type { + OpType::Put => { + let value = kv.value.unwrap(); + memtable.write().put(kv.key, value, kv.file_id.unwrap()); + } + OpType::Del => { + let key = kv.key; + memtable.write().delete(key.as_slice()); + } + }, + } + } + } + + pub fn apply_rewrite_atomic_group(&self, raft: u64, start: FileSeq, end: FileSeq) { + let memtable = self.get_or_insert(raft); + memtable.write().apply_rewrite_atomic_group(start, end); + } + + #[inline] + fn slot_index(id: u64) -> usize { + debug_assert!(MEMTABLE_SLOT_COUNT.is_power_of_two()); + hash_u64(id) as usize & (MEMTABLE_SLOT_COUNT - 1) + } +} + +#[inline] +fn has_internal_key(item: &LogItem) -> bool { + matches!(&item.content, LogItemContent::Kv(KeyValue { key, .. }) if crate::is_internal_key(key, None)) +} + +struct PendingAtomicGroup { + status: AtomicGroupStatus, + items: Vec, + tombstone_items: Vec, + start: FileSeq, + end: FileSeq, +} + +pub struct MemTableRecoverContext { + stats: Arc, + // Tombstones that needs to be transmitted to other context. + tombstone_items: Vec, + memtables: MemTableAccessor, + + // All atomic groups that are not yet completed. + // Each id maps to a list of groups. 
Each list contains at least one, at most two groups. + pending_atomic_groups: HashMap>, +} + +impl MemTableRecoverContext { + fn new() -> Self { + let stats = Arc::new(GlobalStats::default()); + Self { + stats: stats.clone(), + tombstone_items: Vec::new(), + memtables: MemTableAccessor::new(stats), + pending_atomic_groups: HashMap::new(), + } + } +} + +impl MemTableRecoverContext { + fn new_with_allocator(allocator: A) -> Self { + let stats = Arc::new(GlobalStats::default()); + Self { + stats: stats.clone(), + tombstone_items: Vec::new(), + memtables: MemTableAccessor::new_with_allocator(stats, allocator), + pending_atomic_groups: HashMap::new(), + } + } + + pub fn finish(self) -> (MemTableAccessor, Arc) { + (self.memtables, self.stats) + } + + pub fn merge_append_context(&self, append: MemTableRecoverContext) { + self.memtables + .apply_append_writes(append.tombstone_items.into_iter()); + self.memtables.merge_append_table(append.memtables); + } + + #[inline] + fn is_tombstone(item: &LogItem) -> bool { + match &item.content { + LogItemContent::Command(Command::Clean) + | LogItemContent::Command(Command::Compact { .. }) => true, + LogItemContent::Kv(KeyValue { op_type, .. }) if *op_type == OpType::Del => true, + _ => false, + } + } + + fn accept_new_group(&mut self, queue: LogQueue, id: u64, mut new_group: PendingAtomicGroup) { + assert_eq!(queue, LogQueue::Rewrite); + if let Some(groups) = self.pending_atomic_groups.get_mut(&id) { + let group = groups.last_mut().unwrap(); + match (group.status, new_group.status) { + (AtomicGroupStatus::End, AtomicGroupStatus::Begin) => { + groups.push(new_group); + } + // (begin, begin), (middle, begin) + (_, AtomicGroupStatus::Begin) => { + warn!( + "discard old atomic group, status: {:?}, raft_group_id: {:?}", + group.status, + group.items.first().map(|item| item.raft_group_id) + ); + *group = new_group; + } + // (end, middle), (end, end) + (AtomicGroupStatus::End, _) => { + warn!( + "discard new atomic group, status: {:?}, raft_group_id: {:?}", + new_group.status, + new_group.items.first().map(|item| item.raft_group_id) + ); + } + (AtomicGroupStatus::Begin, AtomicGroupStatus::Middle) + | (AtomicGroupStatus::Middle, AtomicGroupStatus::Middle) => { + group.items.append(&mut new_group.items); + group.tombstone_items.append(&mut new_group.tombstone_items); + assert!(group.end <= new_group.start); + group.end = new_group.end; + } + (AtomicGroupStatus::Middle, AtomicGroupStatus::End) => { + group.items.append(&mut new_group.items); + group.tombstone_items.append(&mut new_group.tombstone_items); + group.status = new_group.status; + assert!(group.end <= new_group.start); + group.end = new_group.end; + } + (AtomicGroupStatus::Begin, AtomicGroupStatus::End) => { + let mut group = groups.pop().unwrap(); + let mut rids = HashSet::with_capacity(1); + for item in group + .items + .iter() + .chain(group.tombstone_items.iter()) + .chain(new_group.items.iter()) + .chain(new_group.tombstone_items.iter()) + { + rids.insert(item.raft_group_id); + } + self.tombstone_items.append(&mut group.tombstone_items); + self.tombstone_items.append(&mut new_group.tombstone_items); + self.memtables + .replay_rewrite_writes(group.items.into_iter()); + self.memtables + .replay_rewrite_writes(new_group.items.into_iter()); + assert!(group.end <= new_group.start); + for rid in rids { + self.memtables + .apply_rewrite_atomic_group(rid, group.start, new_group.end); + } + } + } + if groups.is_empty() { + self.pending_atomic_groups.remove(&id); + } + } else { + 
self.pending_atomic_groups.insert(id, vec![new_group]); + } + } +} + +impl Default for MemTableRecoverContext { + fn default() -> Self { + Self::new() + } +} + +impl ReplayMachine for MemTableRecoverContext { + fn replay(&mut self, mut item_batch: LogItemBatch, file_id: FileId) -> Result<()> { + if file_id.queue == LogQueue::Append { + let mut new_tombstones = Vec::new(); + self.memtables + .replay_append_writes(item_batch.drain().filter(|item| { + if Self::is_tombstone(item) { + new_tombstones.push(item.clone()); + } + true + })); + self.tombstone_items.append(&mut new_tombstones); + } else { + let mut new_tombstones = Vec::new(); + let mut is_group = None; + let items = item_batch + .drain() + .filter(|item| { + if let Some(g) = AtomicGroupStatus::parse(item) { + if is_group.is_none() { + is_group = Some(g); + } else { + let msg = format!("skipped an atomic group: {g:?}"); + error!("{msg}"); + debug_assert!(false, "{}", msg); + } + return false; + } + if Self::is_tombstone(item) { + new_tombstones.push(item.clone()); + } + true + }) + .collect(); + if let Some((id, status)) = is_group { + self.accept_new_group( + file_id.queue, + id, + PendingAtomicGroup { + status, + items, + tombstone_items: new_tombstones, + start: file_id.seq, + end: file_id.seq, + }, + ); + } else { + self.tombstone_items.append(&mut new_tombstones); + self.memtables.replay_rewrite_writes(items.into_iter()); + } + } + Ok(()) + } + + fn merge(&mut self, mut rhs: Self, queue: LogQueue) -> Result<()> { + self.tombstone_items + .append(&mut rhs.tombstone_items.clone()); + for (id, groups) in rhs.pending_atomic_groups.drain() { + for group in groups { + self.accept_new_group(queue, id, group); + } + } + match queue { + LogQueue::Append => self + .memtables + .replay_append_writes(rhs.tombstone_items.into_iter()), + LogQueue::Rewrite => self + .memtables + .replay_rewrite_writes(rhs.tombstone_items.into_iter()), + } + self.memtables.merge_newer_neighbor(rhs.memtables); + Ok(()) + } +} + +pub struct MemTableRecoverContextFactory { + allocator: SelectedAllocator, +} + +impl MemTableRecoverContextFactory { + pub fn new(cfg: &Config) -> Self { + Self { + allocator: new_allocator(cfg), + } + } +} + +impl Factory> for MemTableRecoverContextFactory { + fn new_target(&self) -> MemTableRecoverContext { + MemTableRecoverContext::new_with_allocator(self.allocator.clone()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::{catch_unwind_silent, generate_entry_indexes}; + + impl MemTable { + fn max_file_seq(&self, queue: LogQueue) -> Option { + let entry = match queue { + LogQueue::Append if self.rewrite_count == self.entry_indexes.len() => None, + LogQueue::Append => self.entry_indexes.back(), + LogQueue::Rewrite if self.rewrite_count == 0 => None, + LogQueue::Rewrite => self.entry_indexes.get(self.rewrite_count - 1), + }; + let ents_max = entry.map(|e| e.entries.unwrap().id.seq); + + let kvs_max = self.kvs_max_file_seq(queue); + match (ents_max, kvs_max) { + (Some(ents_max), Some(kvs_max)) => Some(FileSeq::max(kvs_max, ents_max)), + (Some(ents_max), None) => Some(ents_max), + (None, Some(kvs_max)) => Some(kvs_max), + (None, None) => None, + } + } + + pub fn kvs_max_file_seq(&self, queue: LogQueue) -> Option { + self.kvs + .values() + .filter(|v| v.1.queue == queue) + .fold(None, |max, v| { + if let Some(max) = max { + Some(std::cmp::max(max, v.1.seq)) + } else { + Some(v.1.seq) + } + }) + } + + pub fn fetch_all(&self, vec_idx: &mut Vec) { + if let Some((first, last)) = self.span() { + 
self.fetch_entries_to(first, last + 1, None, vec_idx) + .unwrap(); + } + } + + fn entries_size(&self) -> usize { + self.entry_indexes + .iter() + .fold(0, |acc, e| acc + e.entry_len) as usize + } + } + + #[test] + fn test_memtable_append() { + let region_id = 8; + let mut memtable = MemTable::new(region_id, Arc::new(GlobalStats::default())); + + // Append entries [10, 20) file_num = 1. + // after appending + // [10, 20) file_num = 1 + memtable.append(generate_entry_indexes( + 10, + 20, + FileId::new(LogQueue::Append, 1), + )); + assert_eq!(memtable.entries_size(), 10); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 1); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 1); + memtable.consistency_check(); + + // Empty. + memtable.append(Vec::new()); + + // Hole. + assert!( + catch_unwind_silent(|| memtable.append(generate_entry_indexes( + 21, + 22, + FileId::dummy(LogQueue::Append) + ))) + .is_err() + ); + memtable.consistency_check(); + + // Append entries [20, 30) file_num = 2. + // after appending: + // [10, 20) file_num = 1 + // [20, 30) file_num = 2 + memtable.append(generate_entry_indexes( + 20, + 30, + FileId::new(LogQueue::Append, 2), + )); + assert_eq!(memtable.entries_size(), 20); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 1); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 2); + memtable.consistency_check(); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + memtable.entries_size() + ); + + // Partial overlap Appending. + // Append entries [25, 35) file_num = 3. + // After appending: + // [10, 20) file_num = 1 + // [20, 25) file_num = 2 + // [25, 35) file_num = 3 + memtable.append(generate_entry_indexes( + 25, + 35, + FileId::new(LogQueue::Append, 3), + )); + assert_eq!(memtable.entries_size(), 25); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 1); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 3); + memtable.consistency_check(); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + memtable.entries_size() + ); + + // Full overlap Appending. + // Append entries [10, 40) file_num = 4. 
+ // After appending: + // [10, 40) file_num = 4 + memtable.append(generate_entry_indexes( + 10, + 40, + FileId::new(LogQueue::Append, 4), + )); + assert_eq!(memtable.entries_size(), 30); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 4); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 4); + memtable.consistency_check(); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + memtable.entries_size() + ); + + let global_stats = Arc::clone(&memtable.global_stats); + drop(memtable); + assert_eq!(global_stats.live_entries(LogQueue::Append), 0); + } + + #[test] + fn test_memtable_compact() { + let region_id = 8; + let mut memtable = MemTable::new(region_id, Arc::new(GlobalStats::default())); + + // After appending: + // [0, 10) file_num = 1 + // [10, 20) file_num = 2 + // [20, 25) file_num = 3 + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.append(generate_entry_indexes( + 10, + 15, + FileId::new(LogQueue::Append, 2), + )); + memtable.append(generate_entry_indexes( + 15, + 20, + FileId::new(LogQueue::Append, 2), + )); + memtable.append(generate_entry_indexes( + 20, + 25, + FileId::new(LogQueue::Append, 3), + )); + + assert_eq!(memtable.entries_size(), 25); + assert_eq!(memtable.first_index().unwrap(), 0); + assert_eq!(memtable.last_index().unwrap(), 24); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 1); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 3); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + memtable.entries_size() + ); + memtable.consistency_check(); + + // Compact to 5. + // Only index is needed to compact. + assert_eq!(memtable.compact_to(5), 5); + assert_eq!(memtable.entries_size(), 20); + assert_eq!(memtable.first_index().unwrap(), 5); + assert_eq!(memtable.last_index().unwrap(), 24); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 1); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 3); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + memtable.entries_size() + ); + // Can't override compacted entries. + assert!( + catch_unwind_silent(|| memtable.append(generate_entry_indexes( + 4, + 5, + FileId::dummy(LogQueue::Append) + ))) + .is_err() + ); + memtable.consistency_check(); + + // Compact to 20. + assert_eq!(memtable.compact_to(20), 15); + assert_eq!(memtable.entries_size(), 5); + assert_eq!(memtable.first_index().unwrap(), 20); + assert_eq!(memtable.last_index().unwrap(), 24); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 3); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 3); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + memtable.entries_size() + ); + memtable.consistency_check(); + + // Compact to 20 or smaller index, nothing happens. + assert_eq!(memtable.compact_to(20), 0); + assert_eq!(memtable.compact_to(15), 0); + assert_eq!(memtable.entries_size(), 5); + assert_eq!(memtable.first_index().unwrap(), 20); + assert_eq!(memtable.last_index().unwrap(), 24); + memtable.consistency_check(); + } + + #[test] + fn test_memtable_fetch() { + let region_id = 8; + let mut memtable = MemTable::new(region_id, Arc::new(GlobalStats::default())); + + let mut ents_idx = vec![]; + + // Fetch empty. 
+ memtable.fetch_all(&mut ents_idx); + assert!(ents_idx.is_empty()); + memtable + .fetch_entries_to(0, 0, None, &mut ents_idx) + .unwrap(); + assert!(matches!( + memtable + .fetch_entries_to(0, 1, None, &mut ents_idx) + .unwrap_err(), + Error::EntryNotFound + )); + + // After appending: + // [0, 10) file_num = 1 + // [10, 15) file_num = 2 + // [15, 20) file_num = 2 + // [20, 25) file_num = 3 + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.append(generate_entry_indexes( + 10, + 20, + FileId::new(LogQueue::Append, 2), + )); + memtable.append(generate_entry_indexes( + 20, + 25, + FileId::new(LogQueue::Append, 3), + )); + + // Fetching all + memtable.fetch_all(&mut ents_idx); + assert_eq!(ents_idx.len(), 25); + assert_eq!(ents_idx[0].index, 0); + assert_eq!(ents_idx[24].index, 24); + + // After compact: + // [10, 15) file_num = 2 + // [15, 20) file_num = 2 + // [20, 25) file_num = 3 + assert_eq!(memtable.compact_to(10), 10); + + // Out of range fetching. + ents_idx.clear(); + assert!(matches!( + memtable + .fetch_entries_to(5, 15, None, &mut ents_idx) + .unwrap_err(), + Error::EntryCompacted + )); + + // Out of range fetching. + ents_idx.clear(); + assert!(matches!( + memtable + .fetch_entries_to(20, 30, None, &mut ents_idx) + .unwrap_err(), + Error::EntryNotFound + )); + + ents_idx.clear(); + memtable + .fetch_entries_to(20, 25, None, &mut ents_idx) + .unwrap(); + assert_eq!(ents_idx.len(), 5); + assert_eq!(ents_idx[0].index, 20); + assert_eq!(ents_idx[4].index, 24); + + ents_idx.clear(); + memtable + .fetch_entries_to(10, 15, None, &mut ents_idx) + .unwrap(); + assert_eq!(ents_idx.len(), 5); + assert_eq!(ents_idx[0].index, 10); + assert_eq!(ents_idx[4].index, 14); + + ents_idx.clear(); + memtable + .fetch_entries_to(10, 25, None, &mut ents_idx) + .unwrap(); + assert_eq!(ents_idx.len(), 15); + assert_eq!(ents_idx[0].index, 10); + assert_eq!(ents_idx[14].index, 24); + + // Max size limitation range fetching. + // Only can fetch [10, 20) because of size limitation, + ents_idx.clear(); + let max_size = Some(10); + memtable + .fetch_entries_to(10, 25, max_size, &mut ents_idx) + .unwrap(); + assert_eq!(ents_idx.len(), 10); + assert_eq!(ents_idx[0].index, 10); + assert_eq!(ents_idx[9].index, 19); + + // Even max size limitation is 0, at least fetch one entry. + ents_idx.clear(); + memtable + .fetch_entries_to(20, 25, Some(0), &mut ents_idx) + .unwrap(); + assert_eq!(ents_idx.len(), 1); + assert_eq!(ents_idx[0].index, 20); + } + + #[test] + fn test_memtable_fetch_rewrite() { + let region_id = 8; + let mut memtable = MemTable::new(region_id, Arc::new(GlobalStats::default())); + let (k1, v1) = (b"key1", b"value1"); + let (k2, v2) = (b"key2", b"value2"); + let (k3, v3) = (b"key3", b"value3"); + + // After appending: + // [0, 10) file_num = 1 + // [10, 20) file_num = 2 + // [20, 25) file_num = 3 + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.put(k1.to_vec(), v1.to_vec(), FileId::new(LogQueue::Append, 1)); + memtable.append(generate_entry_indexes( + 10, + 20, + FileId::new(LogQueue::Append, 2), + )); + memtable.put(k2.to_vec(), v2.to_vec(), FileId::new(LogQueue::Append, 2)); + memtable.append(generate_entry_indexes( + 20, + 25, + FileId::new(LogQueue::Append, 3), + )); + memtable.put(k3.to_vec(), v3.to_vec(), FileId::new(LogQueue::Append, 3)); + memtable.consistency_check(); + + // Rewrite k1. 
+ memtable.rewrite_key(k1.to_vec(), Some(1), 50); + let mut kvs = Vec::new(); + memtable.fetch_kvs_before(1, &mut kvs); + assert!(kvs.is_empty()); + memtable.fetch_rewritten_kvs(&mut kvs); + assert_eq!(kvs.len(), 1); + assert_eq!(kvs.pop().unwrap(), (k1.to_vec(), v1.to_vec())); + // Rewrite deleted k1. + memtable.delete(k1.as_ref()); + assert_eq!(memtable.global_stats.deleted_rewrite_entries(), 1); + memtable.rewrite_key(k1.to_vec(), Some(1), 50); + assert_eq!(memtable.get(k1.as_ref()), None); + memtable.fetch_rewritten_kvs(&mut kvs); + assert!(kvs.is_empty()); + assert_eq!(memtable.global_stats.deleted_rewrite_entries(), 2); + // Rewrite newer append k2/k3. + memtable.rewrite_key(k2.to_vec(), Some(1), 50); + memtable.fetch_rewritten_kvs(&mut kvs); + assert!(kvs.is_empty()); + memtable.rewrite_key(k3.to_vec(), None, 50); // Rewrite encounters newer append. + memtable.fetch_rewritten_kvs(&mut kvs); + assert!(kvs.is_empty()); + assert_eq!(memtable.global_stats.deleted_rewrite_entries(), 4); + // Rewrite k3 multiple times. + memtable.rewrite_key(k3.to_vec(), Some(10), 50); + memtable.rewrite_key(k3.to_vec(), None, 51); + memtable.rewrite_key(k3.to_vec(), Some(11), 52); + memtable.fetch_rewritten_kvs(&mut kvs); + assert_eq!(kvs.len(), 1); + assert_eq!(kvs.pop().unwrap(), (k3.to_vec(), v3.to_vec())); + + // Rewrite indexes: + // [0, 10) queue = rewrite, file_num = 1, + // [10, 20) file_num = 2 + // [20, 25) file_num = 3 + let ents_idx = generate_entry_indexes(0, 10, FileId::new(LogQueue::Rewrite, 1)); + memtable.rewrite(ents_idx, Some(1)); + assert_eq!(memtable.entries_size(), 25); + memtable.consistency_check(); + + let mut ents_idx = vec![]; + assert!(memtable + .fetch_entry_indexes_before(2, &mut ents_idx) + .is_ok()); + assert_eq!(ents_idx.len(), 10); + assert_eq!(ents_idx.last().unwrap().index, 19); + ents_idx.clear(); + assert!(memtable + .fetch_entry_indexes_before(1, &mut ents_idx) + .is_ok()); + assert!(ents_idx.is_empty()); + + ents_idx.clear(); + assert!(memtable + .fetch_rewritten_entry_indexes(&mut ents_idx) + .is_ok()); + assert_eq!(ents_idx.len(), 10); + assert_eq!(ents_idx.first().unwrap().index, 0); + assert_eq!(ents_idx.last().unwrap().index, 9); + } + + #[test] + fn test_memtable_kv_operations() { + fn key(i: u64) -> Vec { + format!("k{i}").as_bytes().to_vec() + } + fn value(i: u64) -> Vec { + format!("v{i}").as_bytes().to_vec() + } + + let region_id = 8; + let mut memtable = MemTable::new(region_id, Arc::new(GlobalStats::default())); + + memtable.put(key(1), value(1), FileId::new(LogQueue::Append, 1)); + memtable.put(key(5), value(5), FileId::new(LogQueue::Append, 5)); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 1); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 5); + assert_eq!(memtable.get(&key(1)), Some(value(1))); + assert_eq!(memtable.get(&key(5)), Some(value(5))); + + let mut res = Vec::new(); + memtable + .scan(None, None, false, |k, v| { + res.push((k.to_vec(), v.to_vec())); + false + }) + .unwrap(); + assert_eq!(res, vec![(key(1), value(1))]); + res.clear(); + memtable + .scan(None, None, true, |k, v| { + res.push((k.to_vec(), v.to_vec())); + false + }) + .unwrap(); + assert_eq!(res, vec![(key(5), value(5))]); + res.clear(); + memtable + .scan(Some(&key(5)), None, false, |key, value| { + res.push((key.to_vec(), value.to_vec())); + true + }) + .unwrap(); + assert_eq!(res, vec![(key(5), value(5))]); + res.clear(); + memtable + .scan(Some(&key(1)), Some(&key(5)), false, |key, value| { + res.push((key.to_vec(), value.to_vec())); + true 
+ }) + .unwrap(); + assert_eq!(res, vec![(key(1), value(1))]); + + memtable.delete(&key(5)); + assert_eq!(memtable.get(&key(5)), None); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 1); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 1); + + memtable.put(key(1), value(1), FileId::new(LogQueue::Rewrite, 2)); + memtable.put(key(5), value(5), FileId::new(LogQueue::Rewrite, 3)); + assert_eq!(memtable.min_file_seq(LogQueue::Append), None); + assert_eq!(memtable.max_file_seq(LogQueue::Append), None); + assert_eq!(memtable.min_file_seq(LogQueue::Rewrite).unwrap(), 2); + assert_eq!(memtable.max_file_seq(LogQueue::Rewrite).unwrap(), 3); + assert_eq!(memtable.global_stats.rewrite_entries(), 2); + + memtable.delete(&key(1)); + assert_eq!(memtable.min_file_seq(LogQueue::Rewrite).unwrap(), 3); + assert_eq!(memtable.max_file_seq(LogQueue::Rewrite).unwrap(), 3); + assert_eq!(memtable.global_stats.deleted_rewrite_entries(), 1); + + memtable.put(key(5), value(5), FileId::new(LogQueue::Append, 7)); + assert_eq!(memtable.min_file_seq(LogQueue::Rewrite), None); + assert_eq!(memtable.max_file_seq(LogQueue::Rewrite), None); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 7); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 7); + assert_eq!(memtable.global_stats.deleted_rewrite_entries(), 2); + } + + #[test] + fn test_memtable_get_entry() { + let region_id = 8; + let mut memtable = MemTable::new(region_id, Arc::new(GlobalStats::default())); + + assert_eq!(memtable.get_entry(0), None); + + // [5, 10) file_num = 1 + // [10, 20) file_num = 2 + memtable.append(generate_entry_indexes( + 5, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.append(generate_entry_indexes( + 10, + 20, + FileId::new(LogQueue::Append, 2), + )); + + // Not in range. + assert_eq!(memtable.get_entry(2), None); + assert_eq!(memtable.get_entry(25), None); + + let entry_idx = memtable.get_entry(5); + assert_eq!(entry_idx.unwrap().index, 5); + } + + #[test] + fn test_memtable_rewrite() { + let region_id = 8; + let mut memtable = MemTable::new(region_id, Arc::new(GlobalStats::default())); + let mut expected_append = 0; + let mut expected_rewrite = 0; + let mut expected_deleted_rewrite = 0; + + // Rewrite to empty table. 
+ let ents_idx = generate_entry_indexes(0, 10, FileId::new(LogQueue::Rewrite, 1)); + memtable.rewrite(ents_idx, Some(1)); + expected_rewrite += 10; + expected_deleted_rewrite += 10; + assert_eq!(memtable.min_file_seq(LogQueue::Rewrite), None); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + expected_append + ); + assert_eq!(memtable.global_stats.rewrite_entries(), expected_rewrite); + assert_eq!( + memtable.global_stats.deleted_rewrite_entries(), + expected_deleted_rewrite + ); + + // Append and compact: + // [10, 20) file_num = 2 + // [20, 30) file_num = 3 + // [30, 40) file_num = 4 + // kk1 -> 2, kk2 -> 3, kk3 -> 4 + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.append(generate_entry_indexes( + 10, + 20, + FileId::new(LogQueue::Append, 2), + )); + memtable.put( + b"kk1".to_vec(), + b"vv1".to_vec(), + FileId::new(LogQueue::Append, 2), + ); + memtable.append(generate_entry_indexes( + 20, + 30, + FileId::new(LogQueue::Append, 3), + )); + memtable.put( + b"kk2".to_vec(), + b"vv2".to_vec(), + FileId::new(LogQueue::Append, 3), + ); + memtable.append(generate_entry_indexes( + 30, + 40, + FileId::new(LogQueue::Append, 4), + )); + memtable.put( + b"kk3".to_vec(), + b"vv3".to_vec(), + FileId::new(LogQueue::Append, 4), + ); + expected_append += 4 * 10 + 3; + memtable.compact_to(10); + expected_append -= 10; + assert_eq!(memtable.entries_size(), 30); + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 2); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 4); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + expected_append + ); + memtable.consistency_check(); + + // Rewrite compacted entries. + // [10, 20) file_num = 2 + // [20, 30) file_num = 3 + // [30, 40) file_num = 4 + // kk1 -> 2, kk2 -> 3, kk3 -> 4 + let ents_idx = generate_entry_indexes(0, 10, FileId::new(LogQueue::Rewrite, 50)); + memtable.rewrite(ents_idx, Some(1)); + memtable.rewrite_key(b"kk0".to_vec(), Some(1), 50); + expected_rewrite += 10 + 1; + expected_deleted_rewrite += 10 + 1; + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 2); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 4); + assert!(memtable.min_file_seq(LogQueue::Rewrite).is_none()); + assert!(memtable.max_file_seq(LogQueue::Rewrite).is_none()); + assert_eq!(memtable.rewrite_count, 0); + assert_eq!(memtable.get(b"kk0"), None); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + expected_append + ); + assert_eq!(memtable.global_stats.rewrite_entries(), expected_rewrite); + assert_eq!( + memtable.global_stats.deleted_rewrite_entries(), + expected_deleted_rewrite + ); + memtable.consistency_check(); + + // Mixed rewrite. 
+ // [10, 20) file_num = 100(r) + // [20, 30) file_num = 101(r) + // [30, 40) file_num = 4 + // kk1 -> 100(r), kk2 -> 101(r), kk3 -> 4 + let ents_idx = generate_entry_indexes(0, 20, FileId::new(LogQueue::Rewrite, 100)); + memtable.rewrite(ents_idx, Some(2)); + memtable.rewrite_key(b"kk0".to_vec(), Some(1), 50); + memtable.rewrite_key(b"kk1".to_vec(), Some(2), 100); + expected_append -= 10 + 1; + expected_rewrite += 20 + 2; + expected_deleted_rewrite += 10 + 1; + let ents_idx = generate_entry_indexes(20, 30, FileId::new(LogQueue::Rewrite, 101)); + memtable.rewrite(ents_idx, Some(3)); + memtable.rewrite_key(b"kk2".to_vec(), Some(3), 101); + expected_append -= 10 + 1; + expected_rewrite += 10 + 1; + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 4); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 4); + assert_eq!(memtable.min_file_seq(LogQueue::Rewrite).unwrap(), 100); + assert_eq!(memtable.max_file_seq(LogQueue::Rewrite).unwrap(), 101); + assert_eq!(memtable.rewrite_count, 20); + assert_eq!(memtable.get(b"kk1"), Some(b"vv1".to_vec())); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + expected_append + ); + assert_eq!(memtable.global_stats.rewrite_entries(), expected_rewrite); + assert_eq!( + memtable.global_stats.deleted_rewrite_entries(), + expected_deleted_rewrite + ); + memtable.consistency_check(); + + // Put some entries overwritting entires in file 4. Then try to rewrite. + // [10, 20) file_num = 100(r) + // [20, 30) file_num = 101(r) + // [30, 35) file_num = 4 -> 102(r) + // 35 file_num = 5 + // kk1 -> 100(r), kk2 -> 101(r), kk3 -> 5 + memtable.append(generate_entry_indexes( + 35, + 36, + FileId::new(LogQueue::Append, 5), + )); + expected_append -= 4; + memtable.put( + b"kk3".to_vec(), + b"vv33".to_vec(), + FileId::new(LogQueue::Append, 5), + ); + assert_eq!(memtable.last_index().unwrap(), 35); + memtable.consistency_check(); + let ents_idx = generate_entry_indexes(30, 40, FileId::new(LogQueue::Rewrite, 102)); + memtable.rewrite(ents_idx, Some(4)); + expected_append -= 5; + expected_rewrite += 10; + expected_deleted_rewrite += 5; + assert_eq!(memtable.min_file_seq(LogQueue::Append).unwrap(), 5); + assert_eq!(memtable.max_file_seq(LogQueue::Append).unwrap(), 5); + assert_eq!(memtable.min_file_seq(LogQueue::Rewrite).unwrap(), 100); + assert_eq!(memtable.max_file_seq(LogQueue::Rewrite).unwrap(), 102); + assert_eq!(memtable.rewrite_count, 25); + assert_eq!(memtable.get(b"kk3"), Some(b"vv33".to_vec())); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + expected_append + ); + assert_eq!(memtable.global_stats.rewrite_entries(), expected_rewrite); + assert_eq!( + memtable.global_stats.deleted_rewrite_entries(), + expected_deleted_rewrite + ); + memtable.consistency_check(); + + // Compact after rewrite. + // [30, 35) file_num = 102(r) + // [35, 50) file_num = 6 + // kk1 -> 100(r), kk2 -> 101(r), kk3 -> 5 + memtable.append(generate_entry_indexes( + 35, + 50, + FileId::new(LogQueue::Append, 6), + )); + expected_append += 15 - 1; + memtable.compact_to(30); + expected_deleted_rewrite += 20; + assert_eq!(memtable.last_index().unwrap(), 49); + assert_eq!(memtable.rewrite_count, 5); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + expected_append + ); + assert_eq!(memtable.global_stats.rewrite_entries(), expected_rewrite); + assert_eq!( + memtable.global_stats.deleted_rewrite_entries(), + expected_deleted_rewrite + ); + memtable.consistency_check(); + + // Squeeze some. 
+ // [30, 35) file_num = 103(r) + // [35, 50) file_num = 6 + // kk1 -> 100(r), kk2 -> 101(r), kk3 -> 5 + let ents_idx = generate_entry_indexes(10, 60, FileId::new(LogQueue::Rewrite, 103)); + memtable.rewrite(ents_idx, None); + expected_rewrite += 50; + expected_deleted_rewrite += 50; + assert_eq!(memtable.first_index().unwrap(), 30); + assert_eq!(memtable.rewrite_count, 5); + assert_eq!( + memtable.global_stats.live_entries(LogQueue::Append), + expected_append + ); + assert_eq!(memtable.global_stats.rewrite_entries(), expected_rewrite); + assert_eq!( + memtable.global_stats.deleted_rewrite_entries(), + expected_deleted_rewrite + ); + memtable.consistency_check(); + + let global_stats = Arc::clone(&memtable.global_stats); + drop(memtable); + assert_eq!(global_stats.live_entries(LogQueue::Append), 0); + assert_eq!(global_stats.live_entries(LogQueue::Rewrite), 0); + } + + #[test] + fn test_memtable_merge_append() { + type TestMemTable = MemTable; + fn empty_table(id: u64) -> TestMemTable { + MemTable::new(id, Arc::new(GlobalStats::default())) + } + let cases = [ + |mut memtable: TestMemTable, on: Option| -> TestMemTable { + match on { + None => { + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.append(generate_entry_indexes( + 7, + 15, + FileId::new(LogQueue::Append, 2), + )); + memtable.rewrite( + generate_entry_indexes(0, 10, FileId::new(LogQueue::Rewrite, 1)), + Some(1), + ); + } + Some(LogQueue::Append) => { + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.append(generate_entry_indexes( + 7, + 15, + FileId::new(LogQueue::Append, 2), + )); + memtable.compact_to(7); + } + Some(LogQueue::Rewrite) => { + memtable.replay_rewrite(generate_entry_indexes( + 0, + 7, + FileId::new(LogQueue::Rewrite, 1), + )); + memtable.replay_rewrite(Vec::new()); + } + } + memtable + }, + |mut memtable: TestMemTable, on: Option| -> TestMemTable { + match on { + None => { + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.append(generate_entry_indexes( + 7, + 15, + FileId::new(LogQueue::Append, 2), + )); + memtable.rewrite( + generate_entry_indexes(0, 10, FileId::new(LogQueue::Rewrite, 1)), + Some(1), + ); + memtable.compact_to(10); + } + Some(LogQueue::Append) => { + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.append(generate_entry_indexes( + 7, + 15, + FileId::new(LogQueue::Append, 2), + )); + memtable.compact_to(10); + } + Some(LogQueue::Rewrite) => { + memtable.replay_rewrite(generate_entry_indexes( + 0, + 7, + FileId::new(LogQueue::Rewrite, 1), + )); + // By MemTableRecoveryContext. 
+ memtable.compact_to(10); + } + } + memtable + }, + |mut memtable: TestMemTable, on: Option| -> TestMemTable { + match on { + None => { + memtable.append(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Append, 1), + )); + memtable.rewrite( + generate_entry_indexes(0, 10, FileId::new(LogQueue::Rewrite, 1)), + Some(1), + ); + memtable.append(generate_entry_indexes( + 10, + 15, + FileId::new(LogQueue::Append, 2), + )); + memtable.append(generate_entry_indexes( + 5, + 10, + FileId::new(LogQueue::Append, 2), + )); + } + Some(LogQueue::Append) => { + let mut m1 = empty_table(memtable.region_id); + m1.append(generate_entry_indexes( + 10, + 15, + FileId::new(LogQueue::Append, 2), + )); + let mut m2 = empty_table(memtable.region_id); + m2.append(generate_entry_indexes( + 5, + 10, + FileId::new(LogQueue::Append, 2), + )); + m1.merge_newer_neighbor(&mut m2); + memtable.merge_newer_neighbor(&mut m1); + } + Some(LogQueue::Rewrite) => { + memtable.replay_rewrite(generate_entry_indexes( + 0, + 10, + FileId::new(LogQueue::Rewrite, 1), + )); + } + } + memtable + }, + ]; + + // merge against empty table. + for (i, case) in cases.iter().enumerate() { + let region_id = i as u64; + let mut append = empty_table(region_id); + let mut rewrite = case(empty_table(region_id), Some(LogQueue::Rewrite)); + rewrite.merge_append_table(&mut append); + assert_eq!( + rewrite.entry_indexes, + case(empty_table(region_id), Some(LogQueue::Rewrite)).entry_indexes, + ); + assert!(append.entry_indexes.is_empty()); + + let mut append = case(empty_table(region_id), Some(LogQueue::Append)); + let mut rewrite = empty_table(region_id); + rewrite.merge_append_table(&mut append); + assert_eq!( + rewrite.entry_indexes, + case(empty_table(region_id), Some(LogQueue::Append)).entry_indexes + ); + assert!(append.entry_indexes.is_empty()); + } + + for (i, case) in cases.iter().enumerate() { + let region_id = i as u64; + let mut append = case(empty_table(region_id), Some(LogQueue::Append)); + let mut rewrite = case(empty_table(region_id), Some(LogQueue::Rewrite)); + rewrite.merge_append_table(&mut append); + let expected = case(empty_table(region_id), None); + assert_eq!( + rewrite.global_stats.live_entries(LogQueue::Append), + expected.global_stats.live_entries(LogQueue::Append) + ); + assert_eq!( + rewrite.global_stats.live_entries(LogQueue::Rewrite), + expected.global_stats.live_entries(LogQueue::Rewrite) + ); + assert_eq!(rewrite.entry_indexes, expected.entry_indexes); + assert!(append.entry_indexes.is_empty()); + } + } + + #[test] + fn test_memtables_merge_append_neighbor() { + let first_rid = 17; + let mut last_rid = first_rid; + + let mut batches = vec![ + LogItemBatch::with_capacity(0), + LogItemBatch::with_capacity(0), + LogItemBatch::with_capacity(0), + ]; + let files: Vec<_> = (0..batches.len()) + .map(|i| FileId::new(LogQueue::Append, 10 + i as u64)) + .collect(); + + // put (key1, v1) => del (key1) => put (key1, v2) + batches[0].put(last_rid, b"key1".to_vec(), b"val1".to_vec()); + batches[1].delete(last_rid, b"key1".to_vec()); + batches[2].put(last_rid, b"key1".to_vec(), b"val2".to_vec()); + + // put (k, _) => cleanup + last_rid += 1; + batches[0].put(last_rid, b"key".to_vec(), b"ANYTHING".to_vec()); + batches[1].add_command(last_rid, Command::Clean); + + // entries [1, 10] => compact 5 => entries [11, 20] + last_rid += 1; + batches[0].add_entry_indexes(last_rid, generate_entry_indexes(1, 11, files[0])); + batches[1].add_command(last_rid, Command::Compact { index: 5 }); + batches[2].add_entry_indexes(last_rid, 
generate_entry_indexes(11, 21, files[2])); + + // entries [1, 10] => entries [11, 20][5, 10] => compact 8 + last_rid += 1; + batches[0].add_entry_indexes(last_rid, generate_entry_indexes(1, 11, files[0])); + batches[1].add_entry_indexes(last_rid, generate_entry_indexes(11, 21, files[1])); + batches[1].add_entry_indexes(last_rid, generate_entry_indexes(5, 11, files[1])); + batches[2].add_command(last_rid, Command::Compact { index: 8 }); + + for b in batches.iter_mut() { + b.finish_write(FileBlockHandle::dummy(LogQueue::Append)); + } + + // reverse merge + let mut ctxs = VecDeque::default(); + for (batch, file_id) in batches.clone().into_iter().zip(files) { + let mut ctx = MemTableRecoverContext::default(); + ctx.replay(batch, file_id).unwrap(); + ctxs.push_back(ctx); + } + while ctxs.len() > 1 { + let (y, mut x) = (ctxs.pop_back().unwrap(), ctxs.pop_back().unwrap()); + x.merge(y, LogQueue::Append).unwrap(); + ctxs.push_back(x); + } + let (merged_memtables, merged_global_stats) = ctxs.pop_front().unwrap().finish(); + + // sequential apply + let sequential_global_stats = Arc::new(GlobalStats::default()); + let sequential_memtables = MemTableAccessor::new(sequential_global_stats.clone()); + for mut batch in batches.clone() { + sequential_memtables.apply_append_writes(batch.drain()); + } + + for rid in first_rid..=last_rid { + let m = merged_memtables.get(rid); + let s = sequential_memtables.get(rid); + if m.is_none() { + assert!(s.is_none()); + continue; + } + let merged = m.as_ref().unwrap().read(); + let sequential = s.as_ref().unwrap().read(); + let mut merged_vec = Vec::new(); + let mut sequential_vec = Vec::new(); + merged + .fetch_entry_indexes_before(u64::MAX, &mut merged_vec) + .unwrap(); + sequential + .fetch_entry_indexes_before(u64::MAX, &mut sequential_vec) + .unwrap(); + assert_eq!(merged_vec, sequential_vec); + merged_vec.clear(); + sequential_vec.clear(); + merged + .fetch_rewritten_entry_indexes(&mut merged_vec) + .unwrap(); + sequential + .fetch_rewritten_entry_indexes(&mut sequential_vec) + .unwrap(); + assert_eq!(merged_vec, sequential_vec); + let mut merged_vec = Vec::new(); + let mut sequential_vec = Vec::new(); + merged.fetch_kvs_before(u64::MAX, &mut merged_vec); + sequential.fetch_kvs_before(u64::MAX, &mut sequential_vec); + assert_eq!(merged_vec, sequential_vec); + merged_vec.clear(); + sequential_vec.clear(); + merged.fetch_rewritten_kvs(&mut merged_vec); + sequential.fetch_rewritten_kvs(&mut sequential_vec); + assert_eq!(merged_vec, sequential_vec); + } + assert_eq!( + merged_global_stats.live_entries(LogQueue::Append), + sequential_global_stats.live_entries(LogQueue::Append), + ); + assert_eq!( + merged_global_stats.rewrite_entries(), + sequential_global_stats.rewrite_entries(), + ); + assert_eq!( + merged_global_stats.deleted_rewrite_entries(), + sequential_global_stats.deleted_rewrite_entries(), + ); + } + + #[cfg(feature = "nightly")] + #[bench] + fn bench_memtable_single_put(b: &mut test::Bencher) { + let mut memtable = MemTable::new(0, Arc::new(GlobalStats::default())); + let key = b"some_key".to_vec(); + let value = vec![7; 12]; + b.iter(move || { + memtable.put(key.clone(), value.clone(), FileId::dummy(LogQueue::Append)); + }); + } + + #[cfg(feature = "nightly")] + #[bench] + fn bench_memtable_triple_puts(b: &mut test::Bencher) { + let mut memtable = MemTable::new(0, Arc::new(GlobalStats::default())); + let key0 = b"some_key0".to_vec(); + let key1 = b"some_key1".to_vec(); + let key2 = b"some_key2".to_vec(); + let value = vec![7; 12]; + b.iter(move || { + 
memtable.put(key0.clone(), value.clone(), FileId::dummy(LogQueue::Append)); + memtable.put(key1.clone(), value.clone(), FileId::dummy(LogQueue::Append)); + memtable.put(key2.clone(), value.clone(), FileId::dummy(LogQueue::Append)); + }); + } +} diff --git a/third/raft-engine/src/metrics.rs b/third/raft-engine/src/metrics.rs new file mode 100644 index 00000000..ef7577fe --- /dev/null +++ b/third/raft-engine/src/metrics.rs @@ -0,0 +1,310 @@ +// Copyright 2021 TiKV Project Authors. Licensed under Apache-2.0. + +use std::{ + cell::{RefCell, RefMut}, + ops::AddAssign, + time::{Duration, Instant}, +}; + +use prometheus::*; +use prometheus_static_metric::*; + +use crate::util::InstantExt; + +pub struct StopWatch { + metric: M, + start: Instant, +} + +impl StopWatch { + #[inline] + pub fn new(metric: M) -> Self { + Self { + metric, + start: Instant::now(), + } + } + + #[inline] + pub fn new_with(metric: M, start: Instant) -> Self { + Self { metric, start } + } +} + +impl Drop for StopWatch { + fn drop(&mut self) { + self.metric.observe(self.start.saturating_elapsed()); + } +} + +/// PerfContext records cumulative performance statistics of operations. +/// +/// Raft Engine will update the data in the thread-local PerfContext whenever +/// an opeartion is performed. +#[derive(Debug, Clone, Default)] +pub struct PerfContext { + /// Time spent encoding and compressing log entries. + pub log_populating_duration: Duration, + + /// Time spent waiting for the write group to form and get processed. + pub write_wait_duration: Duration, + + /// Time spent writing the logs to files. + pub log_write_duration: Duration, + + /// Time spent rotating the active log file. + pub log_rotate_duration: Duration, + + // Time spent synchronizing logs to the disk. + pub log_sync_duration: Duration, + + // Time spent applying the appended logs. + pub apply_duration: Duration, +} + +impl AddAssign<&'_ PerfContext> for PerfContext { + fn add_assign(&mut self, rhs: &PerfContext) { + self.log_populating_duration += rhs.log_populating_duration; + self.write_wait_duration += rhs.write_wait_duration; + self.log_write_duration += rhs.log_write_duration; + self.log_rotate_duration += rhs.log_rotate_duration; + self.log_sync_duration += rhs.log_sync_duration; + self.apply_duration += rhs.apply_duration; + } +} + +thread_local! { + static TLS_PERF_CONTEXT: RefCell = RefCell::new(PerfContext::default()); +} + +/// Gets a copy of the thread-local PerfContext. +pub fn get_perf_context() -> PerfContext { + TLS_PERF_CONTEXT.with(|c| c.borrow().clone()) +} + +/// Resets the thread-local PerfContext and takes its old value. +pub fn take_perf_context() -> PerfContext { + TLS_PERF_CONTEXT.with(|c| c.take()) +} + +/// Sets the value of the thread-local PerfContext. +pub fn set_perf_context(perf_context: PerfContext) { + TLS_PERF_CONTEXT.with(|c| *c.borrow_mut() = perf_context); +} + +pub(crate) struct PerfContextField

<P> {
+    projector: P,
+}
+
+impl<P> PerfContextField<P>
+where
+    P: Fn(&mut PerfContext) -> &mut Duration,
+{
+    pub fn new(projector: P) -> Self {
+        PerfContextField { projector }
+    }
+}
+
+#[macro_export]
+macro_rules! perf_context {
+    ($field: ident) => {
+        $crate::metrics::PerfContextField::new(|perf_context| &mut perf_context.$field)
+    };
+}
+
+pub trait TimeMetric {
+    fn observe(&self, duration: Duration);
+
+    fn observe_since(&self, earlier: Instant) -> Duration {
+        let dur = earlier.saturating_elapsed();
+        self.observe(dur);
+        dur
+    }
+}
+
+impl<'a> TimeMetric for &'a Histogram {
+    fn observe(&self, duration: Duration) {
+        Histogram::observe(self, duration.as_secs_f64());
+    }
+}
+
+impl<P> TimeMetric for PerfContextField<P>
+where + P: Fn(&mut PerfContext) -> &mut Duration, +{ + fn observe(&self, duration: Duration) { + TLS_PERF_CONTEXT.with(|perf_context| { + *RefMut::map(perf_context.borrow_mut(), &self.projector) += duration; + }) + } +} + +impl TimeMetric for (M1, M2) +where + M1: TimeMetric, + M2: TimeMetric, +{ + fn observe(&self, duration: Duration) { + self.0.observe(duration); + self.1.observe(duration); + } +} + +make_static_metric! { + pub label_enum LogQueueKind { + rewrite, + append, + } + + pub struct LogQueueHistogramVec: Histogram { + "type" => LogQueueKind, + } + + pub struct LogQueueCounterVec: IntCounter { + "type" => LogQueueKind, + } + + pub struct LogQueueGaugeVec: IntGauge { + "type" => LogQueueKind, + } +} + +lazy_static! { + // Write path. + pub static ref ENGINE_WRITE_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_write_duration_seconds", + "Bucketed histogram of Raft Engine write duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + pub static ref ENGINE_WRITE_PREPROCESS_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_write_preprocess_duration_seconds", + "Bucketed histogram of Raft Engine write preprocess duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + pub static ref ENGINE_WRITE_LEADER_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_write_leader_duration_seconds", + "Bucketed histogram of Raft Engine write leader duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + pub static ref ENGINE_WRITE_APPLY_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_write_apply_duration_seconds", + "Bucketed histogram of Raft Engine write apply duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + pub static ref ENGINE_WRITE_SIZE_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_write_size", + "Bucketed histogram of Raft Engine write size", + exponential_buckets(256.0, 1.8, 22).unwrap() + ) + .unwrap(); + pub static ref ENGINE_WRITE_COMPRESSION_RATIO_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_write_compression_ratio", + "Bucketed histogram of Raft Engine write compression ratio", + exponential_buckets(0.0005, 1.8, 16).unwrap() + ) + .unwrap(); + pub static ref LOG_ALLOCATE_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_allocate_log_duration_seconds", + "Bucketed histogram of Raft Engine allocate log duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + pub static ref LOG_SYNC_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_sync_log_duration_seconds", + "Bucketed histogram of Raft Engine sync log duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + pub static ref LOG_ROTATE_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_rotate_log_duration_seconds", + "Bucketed histogram of Raft Engine rotate log duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + // Read path. 
+ pub static ref ENGINE_READ_ENTRY_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_read_entry_duration_seconds", + "Bucketed histogram of Raft Engine read entry duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + pub static ref ENGINE_READ_ENTRY_COUNT_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_read_entry_count", + "Bucketed histogram of Raft Engine read entry count", + exponential_buckets(1.0, 1.8, 22).unwrap() + ) + .unwrap(); + pub static ref ENGINE_READ_MESSAGE_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_read_message_duration_seconds", + "Bucketed histogram of Raft Engine read message duration", + exponential_buckets(0.00005, 1.8, 26).unwrap() + ) + .unwrap(); + // Misc. + pub static ref ENGINE_PURGE_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_purge_duration_seconds", + "Bucketed histogram of Raft Engine purge expired files duration", + exponential_buckets(0.001, 1.8, 22).unwrap() + ) + .unwrap(); + pub static ref ENGINE_REWRITE_APPEND_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_rewrite_append_duration_seconds", + "Bucketed histogram of Raft Engine rewrite append queue duration", + exponential_buckets(0.001, 1.8, 22).unwrap() + ) + .unwrap(); + pub static ref ENGINE_REWRITE_REWRITE_DURATION_HISTOGRAM: Histogram = register_histogram!( + "raft_engine_rewrite_rewrite_duration_seconds", + "Bucketed histogram of Raft Engine rewrite rewrite queue duration", + exponential_buckets(0.001, 1.8, 22).unwrap() + ) + .unwrap(); + pub static ref BACKGROUND_REWRITE_BYTES: LogQueueHistogramVec = register_static_histogram_vec!( + LogQueueHistogramVec, + "raft_engine_background_rewrite_bytes", + "Bucketed histogram of bytes written during background rewrite", + &["type"], + exponential_buckets(256.0, 1.8, 22).unwrap() + ) + .unwrap(); + pub static ref LOG_FILE_COUNT: LogQueueGaugeVec = register_static_int_gauge_vec!( + LogQueueGaugeVec, + "raft_engine_log_file_count", + "Amount of log files in Raft engine", + &["type"] + ) + .unwrap(); + pub static ref RECYCLED_FILE_COUNT: LogQueueGaugeVec = register_static_int_gauge_vec!( + LogQueueGaugeVec, + "raft_engine_recycled_file_count", + "Amount of recycled files in Raft engine", + &["type"] + ) + .unwrap(); + pub static ref SWAP_FILE_COUNT: IntGauge = register_int_gauge!( + "raft_engine_swap_file_count", + "Amount of swap files in Raft engine" + ) + .unwrap(); + pub static ref LOG_ENTRY_COUNT: LogQueueGaugeVec = register_static_int_gauge_vec!( + LogQueueGaugeVec, + "raft_engine_log_entry_count", + "Number of log entries in Raft engine", + &["type"] + ) + .unwrap(); + pub static ref MEMORY_USAGE: IntGauge = register_int_gauge!( + "raft_engine_memory_usage", + "Memory in bytes used by Raft engine", + ) + .unwrap(); + pub static ref LOG_WRITE_BYTES_TOTAL: IntCounter = register_int_counter!( + "raft_engine_log_write_bytes_total", + "The log entries write to Raft engin in bytes", + ) + .unwrap(); +} diff --git a/third/raft-engine/src/pipe_log.rs b/third/raft-engine/src/pipe_log.rs new file mode 100644 index 00000000..63a43dde --- /dev/null +++ b/third/raft-engine/src/pipe_log.rs @@ -0,0 +1,208 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +//! A generic log storage. 
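+//!
+//! A rough sketch of the identifiers defined below (sequence numbers are
+//! illustrative only): a [`FileId`] pairs a [`LogQueue`] with a [`FileSeq`], and
+//! its `Ord` impl treats every rewrite-queue file as older than any append-queue
+//! file:
+//!
+//! ```ignore
+//! let rewrite = FileId::new(LogQueue::Rewrite, 100);
+//! let append = FileId::new(LogQueue::Append, 1);
+//! assert!(rewrite < append); // rewrite files always sort as less fresh
+//! assert!(FileId::new(LogQueue::Append, 1) < FileId::new(LogQueue::Append, 2));
+//! ```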
+ +use std::cmp::Ordering; +use std::fmt::{self, Display}; + +use fail::fail_point; +use num_derive::{FromPrimitive, ToPrimitive}; +use num_traits::ToPrimitive; +use serde_repr::{Deserialize_repr, Serialize_repr}; +use strum::EnumIter; + +use crate::Result; + +/// The type of log queue. +#[repr(u8)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub enum LogQueue { + Append = 0, + Rewrite = 1, +} + +/// Sequence number for log file. It is unique within a log queue. +pub type FileSeq = u64; + +/// A unique identifier for a log file. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FileId { + pub queue: LogQueue, + pub seq: FileSeq, +} + +impl FileId { + /// Creates a [`FileId`] from a [`LogQueue`] and a [`FileSeq`]. + pub fn new(queue: LogQueue, seq: FileSeq) -> Self { + Self { queue, seq } + } + + /// Creates a new [`FileId`] representing a non-existing file. + pub fn dummy(queue: LogQueue) -> Self { + Self { queue, seq: 0 } + } +} + +/// Order by freshness. +impl std::cmp::Ord for FileId { + fn cmp(&self, other: &Self) -> Ordering { + match (self.queue, other.queue) { + (LogQueue::Append, LogQueue::Rewrite) => Ordering::Greater, + (LogQueue::Rewrite, LogQueue::Append) => Ordering::Less, + _ => self.seq.cmp(&other.seq), + } + } +} + +impl std::cmp::PartialOrd for FileId { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +/// A logical pointer to a chunk of log file data. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct FileBlockHandle { + pub id: FileId, + pub offset: u64, + pub len: usize, +} + +impl FileBlockHandle { + /// Creates a new [`FileBlockHandle`] that points to nothing. + pub fn dummy(queue: LogQueue) -> Self { + Self { + id: FileId::dummy(queue), + offset: 0, + len: 0, + } + } +} + +/// Version of log file format. +#[repr(u64)] +#[derive( + Clone, + Copy, + Debug, + Eq, + PartialEq, + FromPrimitive, + ToPrimitive, + Serialize_repr, + Deserialize_repr, + EnumIter, + Default, +)] +pub enum Version { + #[default] + V1 = 1, + V2 = 2, +} + +impl Version { + pub fn has_log_signing(&self) -> bool { + fail_point!("pipe_log::version::force_enable_log_signing", |_| { true }); + match self { + Version::V1 => false, + Version::V2 => true, + } + } +} + +impl Display for Version { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.to_u64().unwrap()) + } +} + +pub struct LogFileContext { + pub id: FileId, + pub version: Version, +} + +impl LogFileContext { + pub fn new(id: FileId, version: Version) -> Self { + Self { id, version } + } + + /// Returns the `signature` in `u32` format. + pub fn get_signature(&self) -> Option { + if self.version.has_log_signing() { + // Here, the count of files will be always limited to less than + // `u32::MAX`. So, we just use the low 32 bit as the `signature` + // by default. + Some(self.id.seq as u32) + } else { + None + } + } +} + +/// Some bytes whose value might be dependent on the file it is written to. +pub trait ReactiveBytes { + fn as_bytes(&mut self, ctx: &LogFileContext) -> &[u8]; +} + +impl ReactiveBytes for &T +where + T: AsRef<[u8]> + ?Sized, +{ + fn as_bytes(&mut self, _ctx: &LogFileContext) -> &[u8] { + (*self).as_ref() + } +} + +/// A `PipeLog` serves reads and writes over multiple queues of log files. +/// +/// # Safety +/// +/// The pipe will panic if it encounters an unrecoverable failure. Otherwise the +/// operations on it should be atomic, i.e. 
failed operation will not affect +/// other ones, and user can still use it afterwards without breaking +/// consistency. +pub trait PipeLog: Sized { + /// Reads some bytes from the specified position. + fn read_bytes(&self, handle: FileBlockHandle) -> Result>; + + /// Appends some bytes to the specified log queue. Returns file position of + /// the written bytes. + fn append( + &self, + queue: LogQueue, + bytes: &mut T, + ) -> Result; + + /// Synchronizes all buffered writes. + /// + /// This operation might incurs a great latency overhead. It's advised to + /// call it once every batch of writes. + fn sync(&self, queue: LogQueue) -> Result<()>; + + /// Returns the smallest and largest file sequence number, still in use, + /// of the specified log queue. + fn file_span(&self, queue: LogQueue) -> (FileSeq, FileSeq); + + /// Returns the oldest file ID that is newer than `position`% of all files. + fn file_at(&self, queue: LogQueue, mut position: f64) -> FileSeq { + position = position.clamp(0.0, 1.0); + let (first, active) = self.file_span(queue); + let count = active - first + 1; + first + (count as f64 * position) as u64 + } + + /// Returns total size of the specified log queue. + fn total_size(&self, queue: LogQueue) -> usize; + + /// Rotates a new log file for the specified log queue. + /// + /// Implementation should be atomic under error conditions but not + /// necessarily panic-safe. + fn rotate(&self, queue: LogQueue) -> Result<()>; + + /// Deletes all log files smaller than the specified file ID. The scope is + /// limited to the log queue of `file_id`. + /// + /// Returns the number of deleted files. + fn purge_to(&self, file_id: FileId) -> Result; +} diff --git a/third/raft-engine/src/purge.rs b/third/raft-engine/src/purge.rs new file mode 100644 index 00000000..b7de5352 --- /dev/null +++ b/third/raft-engine/src/purge.rs @@ -0,0 +1,556 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use std::collections::VecDeque; +use std::collections::{HashMap, HashSet}; +use std::mem; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +use fail::fail_point; +use log::{info, warn}; +use parking_lot::{Mutex, RwLock}; + +use crate::config::Config; +use crate::engine::read_entry_bytes_from_file; +use crate::event_listener::EventListener; +use crate::log_batch::{AtomicGroupBuilder, LogBatch}; +use crate::memtable::{MemTableHandle, MemTables}; +use crate::metrics::*; +use crate::pipe_log::{FileBlockHandle, FileId, FileSeq, LogQueue, PipeLog}; +use crate::{GlobalStats, Result}; + +// Force compact region with oldest 20% logs. +const FORCE_COMPACT_RATIO: f64 = 0.2; +// Only rewrite region with oldest 70% logs. +const REWRITE_RATIO: f64 = 0.7; +// Only rewrite region with stale logs less than this threshold. +const MAX_REWRITE_ENTRIES_PER_REGION: usize = 32; +const MAX_COUNT_BEFORE_FORCE_REWRITE: u32 = 9; + +fn max_batch_bytes() -> usize { + fail_point!("max_rewrite_batch_bytes", |s| s + .unwrap() + .parse::() + .unwrap()); + 128 * 1024 +} + +fn max_forcely_sync_bytes() -> usize { + max_batch_bytes() * 4 +} + +pub struct PurgeManager

<P>
+where
+    P: PipeLog,
+{
+    cfg: Arc<Config>,
+    instance_id: u64,
+    memtables: MemTables,
+    pipe_log: Arc<P>,
+    global_stats: Arc<GlobalStats>,
+    listeners: Vec<Arc<dyn EventListener>>,
+
+    // Only one thread can run `purge_expired_files` at a time.
+    //
+    // This table records Raft Groups that should be force compacted before. Those that are not
+    // compacted in time (after `MAX_EPOCH_BEFORE_FORCE_REWRITE` epochs) will be force rewritten.
+    force_rewrite_candidates: Arc<Mutex<HashMap<u64, u32>>>,
+}
+
+impl<P> PurgeManager<P>
+where
+    P: PipeLog,
+{
+    pub fn new(
+        cfg: Arc<Config>,
+        instance_id: u64,
+        memtables: MemTables,
+        pipe_log: Arc<P>,
+        global_stats: Arc<GlobalStats>,
+        listeners: Vec<Arc<dyn EventListener>>,
+    ) -> PurgeManager<P>
{ + PurgeManager { + cfg, + instance_id, + memtables, + pipe_log, + global_stats, + listeners, + force_rewrite_candidates: Arc::new(Mutex::new(HashMap::default())), + } + } + + pub fn purge_expired_files(&self) -> Result> { + let _t = StopWatch::new(&*ENGINE_PURGE_DURATION_HISTOGRAM); + let guard = self.force_rewrite_candidates.try_lock(); + if guard.is_none() { + warn!("Unable to purge expired files: locked"); + return Ok(vec![]); + } + let mut rewrite_candidate_regions = guard.unwrap(); + + let mut should_compact = HashSet::new(); + if self.needs_rewrite_log_files(LogQueue::Rewrite) { + should_compact.extend(self.rewrite_rewrite_queue()?); + self.rescan_memtables_and_purge_stale_files( + LogQueue::Rewrite, + self.pipe_log.file_span(LogQueue::Rewrite).1, + )?; + } + + if self.needs_rewrite_log_files(LogQueue::Append) { + if let (Some(rewrite_watermark), Some(compact_watermark)) = + self.append_queue_watermarks() + { + let (first_append, latest_append) = self.pipe_log.file_span(LogQueue::Append); + let append_queue_barrier = + self.listeners.iter().fold(latest_append, |barrier, l| { + l.first_file_not_ready_for_purge(LogQueue::Append) + .map_or(barrier, |f| std::cmp::min(f, barrier)) + }); + + // Ordering + // 1. Must rewrite tombstones AFTER acquiring `append_queue_barrier`, or + // deletion marks might be lost after restart. + // 2. Must rewrite tombstones BEFORE rewrite entries, or entries from recreated + // region might be lost after restart. + self.rewrite_append_queue_tombstones()?; + should_compact.extend(self.rewrite_or_compact_append_queue( + rewrite_watermark, + compact_watermark, + &mut rewrite_candidate_regions, + )?); + + if append_queue_barrier == first_append && first_append < latest_append { + warn!("Unable to purge expired files: blocked by barrier"); + } + self.rescan_memtables_and_purge_stale_files( + LogQueue::Append, + append_queue_barrier, + )?; + } + } + Ok(should_compact.into_iter().collect()) + } + + /// Rewrite append files with seqno no larger than `watermark`. When it's + /// None, rewrite the entire queue. Returns the number of purged files. 
+ #[allow(dead_code)] + pub fn must_rewrite_append_queue( + &self, + watermark: Option, + exit_after_step: Option, + ) { + let _lk = self.force_rewrite_candidates.try_lock().unwrap(); + let (_, last) = self.pipe_log.file_span(LogQueue::Append); + let watermark = watermark.map_or(last, |w| std::cmp::min(w, last)); + if watermark == last { + self.pipe_log.rotate(LogQueue::Append).unwrap(); + } + self.rewrite_append_queue_tombstones().unwrap(); + if exit_after_step == Some(1) { + return; + } + self.rewrite_memtables(self.memtables.collect(|_| true), 0, Some(watermark)) + .unwrap(); + if exit_after_step == Some(2) { + return; + } + self.rescan_memtables_and_purge_stale_files( + LogQueue::Append, + self.pipe_log.file_span(LogQueue::Append).1, + ) + .unwrap(); + } + + #[allow(dead_code)] + pub fn must_rewrite_rewrite_queue(&self) { + let _lk = self.force_rewrite_candidates.try_lock().unwrap(); + self.rewrite_rewrite_queue().unwrap(); + self.rescan_memtables_and_purge_stale_files( + LogQueue::Rewrite, + self.pipe_log.file_span(LogQueue::Rewrite).1, + ) + .unwrap(); + } + + #[allow(dead_code)] + pub fn must_purge_all_stale(&self) { + let _lk = self.force_rewrite_candidates.try_lock().unwrap(); + self.pipe_log.rotate(LogQueue::Rewrite).unwrap(); + self.rescan_memtables_and_purge_stale_files( + LogQueue::Rewrite, + self.pipe_log.file_span(LogQueue::Rewrite).1, + ) + .unwrap(); + self.pipe_log.rotate(LogQueue::Append).unwrap(); + self.rescan_memtables_and_purge_stale_files( + LogQueue::Append, + self.pipe_log.file_span(LogQueue::Append).1, + ) + .unwrap(); + } + + pub(crate) fn needs_rewrite_log_files(&self, queue: LogQueue) -> bool { + let (first_file, active_file) = self.pipe_log.file_span(queue); + if active_file == first_file { + return false; + } + + let total_size = self.pipe_log.total_size(queue); + match queue { + LogQueue::Append => total_size > self.cfg.purge_threshold.0 as usize, + LogQueue::Rewrite => { + let compacted_rewrites_ratio = self.global_stats.deleted_rewrite_entries() as f64 + / self.global_stats.rewrite_entries() as f64; + total_size > self.cfg.purge_rewrite_threshold.unwrap().0 as usize + && compacted_rewrites_ratio > self.cfg.purge_rewrite_garbage_ratio + } + } + } + + // Returns (rewrite_watermark, compact_watermark). + // Files older than compact_watermark should be compacted; + // Files between compact_watermark and rewrite_watermark should be rewritten. + fn append_queue_watermarks(&self) -> (Option, Option) { + let queue = LogQueue::Append; + + let (first_file, active_file) = self.pipe_log.file_span(queue); + if active_file == first_file { + // Can't rewrite or force compact the active file. 
+ return (None, None); + } + + let rewrite_watermark = self.pipe_log.file_at(queue, REWRITE_RATIO); + let compact_watermark = self.pipe_log.file_at(queue, FORCE_COMPACT_RATIO); + debug_assert!(active_file - 1 > 0); + ( + Some(std::cmp::min(rewrite_watermark, active_file - 1)), + Some(std::cmp::min(compact_watermark, active_file - 1)), + ) + } + + fn rewrite_or_compact_append_queue( + &self, + rewrite_watermark: FileSeq, + compact_watermark: FileSeq, + rewrite_candidates: &mut HashMap, + ) -> Result> { + let _t = StopWatch::new(&*ENGINE_REWRITE_APPEND_DURATION_HISTOGRAM); + debug_assert!(compact_watermark <= rewrite_watermark); + let mut should_compact = Vec::with_capacity(16); + + let mut new_candidates = HashMap::with_capacity(rewrite_candidates.len()); + let memtables = self.memtables.collect(|t| { + let min_append_seq = t.min_file_seq(LogQueue::Append).unwrap_or(u64::MAX); + let old = min_append_seq < compact_watermark || t.rewrite_count() > 0; + let has_something_to_rewrite = min_append_seq <= rewrite_watermark; + let append_heavy = t.has_at_least_some_entries_before( + FileId::new(LogQueue::Append, rewrite_watermark), + MAX_REWRITE_ENTRIES_PER_REGION + t.rewrite_count(), + ); + let full_heavy = t.has_at_least_some_entries_before( + FileId::new(LogQueue::Append, rewrite_watermark), + MAX_REWRITE_ENTRIES_PER_REGION, + ); + // counter is the times that target region triggers force compact. + let compact_counter = rewrite_candidates.get(&t.region_id()).unwrap_or(&0); + if old && full_heavy { + if *compact_counter < MAX_COUNT_BEFORE_FORCE_REWRITE { + // repeatedly ask user to compact these heavy regions. + should_compact.push(t.region_id()); + new_candidates.insert(t.region_id(), *compact_counter + 1); + return false; + } else { + // user is not responsive, do the rewrite ourselves. + should_compact.push(t.region_id()); + return has_something_to_rewrite; + } + } + !append_heavy && has_something_to_rewrite + }); + + self.rewrite_memtables( + memtables, + MAX_REWRITE_ENTRIES_PER_REGION, + Some(rewrite_watermark), + )?; + *rewrite_candidates = new_candidates; + + Ok(should_compact) + } + + // Rewrites the entire rewrite queue into new log files. + fn rewrite_rewrite_queue(&self) -> Result> { + let _t = StopWatch::new(&*ENGINE_REWRITE_REWRITE_DURATION_HISTOGRAM); + self.pipe_log.rotate(LogQueue::Rewrite)?; + + let mut force_compact_regions = vec![]; + let memtables = self.memtables.collect(|t| { + // if the region is force rewritten, we should also trigger compact. + if t.rewrite_count() > MAX_REWRITE_ENTRIES_PER_REGION { + force_compact_regions.push(t.region_id()); + } + t.min_file_seq(LogQueue::Rewrite).is_some() + }); + + self.rewrite_memtables(memtables, 0 /* expect_rewrites_per_memtable */, None)?; + self.global_stats.reset_rewrite_counters(); + Ok(force_compact_regions) + } + + fn rewrite_append_queue_tombstones(&self) -> Result<()> { + let mut log_batch = self.memtables.take_cleaned_region_logs(); + self.rewrite_impl( + &mut log_batch, + None, /* rewrite_watermark */ + true, /* sync */ + )?; + Ok(()) + } + + // Exclusive. 
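+    // The `seq` argument is an exclusive upper bound: it is folded down to the
+    // smallest file sequence still referenced by any memtable, and only files
+    // strictly older than that sequence are purged.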
+ fn rescan_memtables_and_purge_stale_files(&self, queue: LogQueue, seq: FileSeq) -> Result<()> { + let min_seq = self.memtables.fold(seq, |min, t| { + t.min_file_seq(queue).map_or(min, |m| std::cmp::min(min, m)) + }); + + let purged = self.pipe_log.purge_to(FileId { + queue, + seq: min_seq, + })?; + if purged > 0 { + info!("purged {purged} expired log files for queue {queue:?}"); + for listener in &self.listeners { + listener.post_purge(FileId { + queue, + seq: min_seq - 1, + }); + } + } + Ok(()) + } + + fn rewrite_memtables( + &self, + memtables: Vec, + expect_rewrites_per_memtable: usize, + rewrite: Option, + ) -> Result<()> { + // Only use atomic group for rewrite-rewrite operation. + let needs_atomicity = (|| { + fail_point!("force_use_atomic_group", |_| true); + rewrite.is_none() + })(); + let mut log_batch = LogBatch::default(); + for memtable in memtables { + let mut entry_indexes = Vec::with_capacity(expect_rewrites_per_memtable); + let mut kvs = Vec::new(); + let region_id = { + let m = memtable.read(); + if let Some(rewrite) = rewrite { + m.fetch_entry_indexes_before(rewrite, &mut entry_indexes)?; + m.fetch_kvs_before(rewrite, &mut kvs); + } else { + m.fetch_rewritten_entry_indexes(&mut entry_indexes)?; + m.fetch_rewritten_kvs(&mut kvs); + } + m.region_id() + }; + + let mut previous_size = log_batch.approximate_size(); + let mut atomic_group = None; + let mut atomic_group_start = None; + let mut current_entry_indexes = Vec::new(); + let mut current_entries = Vec::new(); + let mut current_size = 0; + let mut unsynced_size = 0; + // Split the entries into smaller chunks, so that we don't OOM, and the + // compression overhead is not too high. + let mut entry_indexes = entry_indexes.into_iter().peekable(); + while let Some(ei) = entry_indexes.next() { + let entry = + read_entry_bytes_from_file(self.instance_id, self.pipe_log.as_ref(), &ei)?; + current_size += entry.len(); + current_entries.push(entry); + current_entry_indexes.push(ei); + unsynced_size += current_size; + // If this is the last entry, we handle them outside the loop. + if entry_indexes.peek().is_some() + && current_size + previous_size > max_batch_bytes() + { + if needs_atomicity { + if previous_size > 0 { + // We are certain that prev raft group and current raft group cannot fit + // inside one batch. + // To avoid breaking atomicity, we need to flush. + self.rewrite_impl(&mut log_batch, rewrite, false)?; + previous_size = 0; + if current_size <= max_batch_bytes() { + continue; + } + } + match atomic_group.as_mut() { + None => { + let mut g = AtomicGroupBuilder::default(); + g.begin(&mut log_batch); + atomic_group = Some(g); + } + Some(g) => { + g.add(&mut log_batch); + } + } + } + + log_batch.add_raw_entries( + region_id, + mem::take(&mut current_entry_indexes), + mem::take(&mut current_entries), + )?; + current_size = 0; + previous_size = 0; + let sync = if unsynced_size >= max_forcely_sync_bytes() { + // Avoiding too many unsynced size can make the later `fdatasync` in + // the append progress blocked for too long. 
+                        unsynced_size = 0;
+                        true
+                    } else {
+                        false
+                    };
+                    let handle = self.rewrite_impl(&mut log_batch, rewrite, sync)?.unwrap();
+                    if needs_atomicity && atomic_group_start.is_none() {
+                        atomic_group_start = Some(handle.id.seq);
+                    }
+                }
+            }
+            log_batch.add_raw_entries(region_id, current_entry_indexes, current_entries)?;
+            for (k, v) in kvs {
+                log_batch.put(region_id, k, v)?;
+            }
+            if let Some(g) = atomic_group.as_mut() {
+                g.end(&mut log_batch);
+                let handle = self.rewrite_impl(&mut log_batch, rewrite, false)?.unwrap();
+                self.memtables.apply_rewrite_atomic_group(
+                    region_id,
+                    atomic_group_start.unwrap(),
+                    handle.id.seq,
+                );
+            } else if log_batch.approximate_size() > max_batch_bytes() {
+                self.rewrite_impl(&mut log_batch, rewrite, false)?;
+            }
+        }
+        self.rewrite_impl(&mut log_batch, rewrite, true)?;
+        Ok(())
+    }
+
+    fn rewrite_impl(
+        &self,
+        log_batch: &mut LogBatch,
+        rewrite_watermark: Option<FileSeq>,
+        sync: bool,
+    ) -> Result<Option<FileBlockHandle>> {
+        if log_batch.is_empty() {
+            debug_assert!(sync);
+            self.pipe_log.sync(LogQueue::Rewrite)?;
+            return Ok(None);
+        }
+        log_batch.finish_populate(
+            self.cfg.batch_compression_threshold.0 as usize,
+            self.cfg.compression_level,
+        )?;
+        let file_handle = self.pipe_log.append(LogQueue::Rewrite, log_batch)?;
+        if sync {
+            self.pipe_log.sync(LogQueue::Rewrite)?;
+        }
+        log_batch.finish_write(file_handle);
+        self.memtables.apply_rewrite_writes(
+            log_batch.drain(),
+            rewrite_watermark,
+            file_handle.id.seq,
+        );
+        for listener in &self.listeners {
+            listener.post_apply_memtables(file_handle.id);
+        }
+        if rewrite_watermark.is_none() {
+            BACKGROUND_REWRITE_BYTES
+                .rewrite
+                .observe(file_handle.len as f64);
+        } else {
+            BACKGROUND_REWRITE_BYTES
+                .append
+                .observe(file_handle.len as f64);
+        }
+        Ok(Some(file_handle))
+    }
+}
+
+#[derive(Default)]
+pub struct PurgeHook {
+    // Append queue log files that are not yet fully applied to MemTables must not be
+    // purged even when not referenced by any MemTable.
+    // In order to identify them, maintain a per-file reference counter for all active
+    // log files in the append queue. No need to track the rewrite queue because it is
+    // only written by the purge thread.
+ active_log_files: RwLock>, +} + +impl EventListener for PurgeHook { + fn post_new_log_file(&self, file_id: FileId) { + if file_id.queue == LogQueue::Append { + let mut active_log_files = self.active_log_files.write(); + if let Some(seq) = active_log_files.back().map(|x| x.0) { + assert_eq!( + seq + 1, + file_id.seq, + "active log files should be contiguous" + ); + } + let counter = AtomicUsize::new(0); + active_log_files.push_back((file_id.seq, counter)); + } + } + + fn on_append_log_file(&self, handle: FileBlockHandle) { + if handle.id.queue == LogQueue::Append { + let active_log_files = self.active_log_files.read(); + assert!(!active_log_files.is_empty()); + let front = active_log_files[0].0; + let counter = &active_log_files[(handle.id.seq - front) as usize].1; + counter.fetch_add(1, Ordering::Release); + } + } + + fn post_apply_memtables(&self, file_id: FileId) { + if file_id.queue == LogQueue::Append { + let active_log_files = self.active_log_files.read(); + assert!(!active_log_files.is_empty()); + let front = active_log_files[0].0; + let counter = &active_log_files[(file_id.seq - front) as usize].1; + counter.fetch_sub(1, Ordering::Release); + } + } + + fn first_file_not_ready_for_purge(&self, queue: LogQueue) -> Option { + if queue == LogQueue::Append { + let active_log_files = self.active_log_files.read(); + for (id, counter) in active_log_files.iter() { + if counter.load(Ordering::Acquire) > 0 { + return Some(*id); + } + } + } + None + } + + fn post_purge(&self, file_id: FileId) { + if file_id.queue == LogQueue::Append { + let mut active_log_files = self.active_log_files.write(); + assert!(!active_log_files.is_empty()); + let front = active_log_files[0].0; + if front <= file_id.seq { + let mut purged = active_log_files.drain(0..=(file_id.seq - front) as usize); + assert_eq!(purged.next_back().unwrap().0, file_id.seq); + } + } + } +} diff --git a/third/raft-engine/src/swappy_allocator.rs b/third/raft-engine/src/swappy_allocator.rs new file mode 100644 index 00000000..0cb8db9b --- /dev/null +++ b/third/raft-engine/src/swappy_allocator.rs @@ -0,0 +1,1267 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +//! # Swappy Allocator + +use std::alloc::{AllocError, Allocator, Global, Layout}; +use std::fs::{File, OpenOptions}; +use std::path::{Path, PathBuf}; +use std::ptr::{self, NonNull}; +use std::sync::atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::vec::Vec; + +use log::{error, warn}; +use memmap2::MmapMut; +use parking_lot::Mutex; + +use crate::metrics::SWAP_FILE_COUNT; + +const DEFAULT_PAGE_SIZE: usize = 64 * 1024 * 1024; // 64MB + +struct SwappyAllocatorCore +where + A: Allocator + Send + Sync, +{ + budget: usize, + path: PathBuf, + + mem_usage: AtomicUsize, + mem_allocator: A, + + maybe_swapped: AtomicBool, + page_seq: AtomicU32, + pages: Mutex>, +} + +/// An [`Allocator`] implementation that has a memory budget and can be swapped +/// out. +/// +/// The allocations of its internal metadata are not managed (i.e. allocated via +/// `std::alloc::Global`). Do NOT use it as the global allocator. +#[derive(Clone)] +pub struct SwappyAllocator(Arc>); + +impl SwappyAllocator { + pub fn new_over(path: &Path, budget: usize, alloc: A) -> SwappyAllocator { + if path.exists() { + if let Err(e) = std::fs::remove_dir_all(path) { + error!( + "Failed to clean up old swap directory: {e}. 
\ + There might be obsolete swap files left in {}.", + path.display() + ); + } + } + let core = SwappyAllocatorCore { + budget, + path: path.into(), + mem_usage: AtomicUsize::new(0), + mem_allocator: alloc, + maybe_swapped: AtomicBool::new(false), + page_seq: AtomicU32::new(0), + pages: Mutex::new(Vec::new()), + }; + SwappyAllocator(Arc::new(core)) + } + + #[inline] + pub fn memory_usage(&self) -> usize { + self.0.mem_usage.load(Ordering::Relaxed) + } + + #[inline] + fn is_swapped(&self, ptr: NonNull, exhaustive_check: bool) -> bool { + // Ordering: `maybe_swapped` must be read after the pointer is available. + std::sync::atomic::fence(Ordering::Acquire); + self.0.maybe_swapped.load(Ordering::Relaxed) + && (!exhaustive_check || self.0.pages.lock().iter().any(|page| page.contains(ptr))) + } + + #[inline] + fn allocate_swapped(&self, layout: Layout) -> Result, AllocError> { + let mut pages = self.0.pages.lock(); + match pages.last_mut().and_then(|p| p.allocate(layout)) { + None => { + self.0.maybe_swapped.store(true, Ordering::Relaxed); + // Ordering: `maybe_swapped` must be set before the page is created. + std::sync::atomic::fence(Ordering::Release); + pages.push( + Page::new( + &self.0.path, + self.0.page_seq.fetch_add(1, Ordering::Relaxed), + std::cmp::max(DEFAULT_PAGE_SIZE, layout.size()), + ) + .ok_or(AllocError)?, + ); + pages.last_mut().unwrap().allocate(layout).ok_or(AllocError) + } + Some(r) => Ok(r), + } + } +} + +impl SwappyAllocator { + pub fn new(path: &Path, budget: usize) -> SwappyAllocator { + Self::new_over(path, budget, Global) + } +} + +unsafe impl Allocator for SwappyAllocator { + #[inline] + fn allocate(&self, layout: Layout) -> Result, AllocError> { + // Always use mem_allocator to allocate empty pointer. + if layout.size() > 0 + && self.0.mem_usage.fetch_add(layout.size(), Ordering::Relaxed) + layout.size() + > self.0.budget + { + let swap_r = self.allocate_swapped(layout); + if swap_r.is_ok() { + self.0.mem_usage.fetch_sub(layout.size(), Ordering::Relaxed); + return swap_r; + } + } + self.0.mem_allocator.allocate(layout).map_err(|e| { + self.0.mem_usage.fetch_sub(layout.size(), Ordering::Relaxed); + e + }) + } + + #[inline] + unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { + if self.is_swapped(ptr, false) { + let mut pages = self.0.pages.lock(); + // Find the page it belongs to, then deallocate itself. 
+ for i in 0..pages.len() { + if pages[i].contains(ptr) { + if pages[i].deallocate(ptr) { + let page = pages.remove(i); + page.release(&self.0.path); + } + if pages.is_empty() { + self.0.maybe_swapped.store(false, Ordering::Relaxed); + } + return; + } + } + } + self.0.mem_usage.fetch_sub(layout.size(), Ordering::Relaxed); + self.0.mem_allocator.deallocate(ptr, layout) + } + + #[inline] + fn allocate_zeroed(&self, layout: Layout) -> Result, AllocError> { + let ptr = self.allocate(layout)?; + unsafe { ptr.as_non_null_ptr().as_ptr().write_bytes(0, ptr.len()) } + Ok(ptr) + } + + #[inline] + unsafe fn grow( + &self, + ptr: NonNull, + old_layout: Layout, + new_layout: Layout, + ) -> Result, AllocError> { + let diff = new_layout.size() - old_layout.size(); + let mem_usage = self.0.mem_usage.fetch_add(diff, Ordering::Relaxed) + diff; + if mem_usage > self.0.budget || self.is_swapped(ptr, false) { + self.0.mem_usage.fetch_sub(diff, Ordering::Relaxed); + // Copied from std's blanket implementation // + debug_assert!( + new_layout.size() >= old_layout.size(), + "`new_layout.size()` must be greater than or equal to `old_layout.size()`" + ); + + let new_ptr = self.allocate(new_layout)?; + + // SAFETY: because `new_layout.size()` must be greater than or equal to + // `old_layout.size()`, both the old and new memory allocation are valid for + // reads and writes for `old_layout.size()` bytes. Also, because the + // old allocation wasn't yet deallocated, it cannot overlap + // `new_ptr`. Thus, the call to `copy_nonoverlapping` is + // safe. The safety contract for `dealloc` must be upheld by the caller. + #[allow(unused_unsafe)] + unsafe { + ptr::copy_nonoverlapping(ptr.as_ptr(), new_ptr.as_mut_ptr(), old_layout.size()); + self.deallocate(ptr, old_layout); + } + + Ok(new_ptr) + } else { + self.0 + .mem_allocator + .grow(ptr, old_layout, new_layout) + .map_err(|e| { + self.0.mem_usage.fetch_sub(diff, Ordering::Relaxed); + e + }) + } + } + + #[inline] + unsafe fn grow_zeroed( + &self, + ptr: NonNull, + old_layout: Layout, + new_layout: Layout, + ) -> Result, AllocError> { + let ptr = self.grow(ptr, old_layout, new_layout)?; + ptr.as_non_null_ptr().as_ptr().write_bytes(0, ptr.len()); + Ok(ptr) + } + + #[inline] + unsafe fn shrink( + &self, + ptr: NonNull, + old_layout: Layout, + new_layout: Layout, + ) -> Result, AllocError> { + if self.is_swapped(ptr, true) { + // This is a swapped page. + if self.0.mem_usage.load(Ordering::Relaxed) + new_layout.size() <= self.0.budget { + // It's probably okay to reallocate to memory. + // Copied from std's blanket implementation // + debug_assert!( + new_layout.size() <= old_layout.size(), + "`new_layout.size()` must be smaller than or equal to `old_layout.size()`" + ); + + let new_ptr = self.allocate(new_layout)?; + + // SAFETY: because `new_layout.size()` must be lower than or equal to + // `old_layout.size()`, both the old and new memory allocation are valid for + // reads and writes for `new_layout.size()` bytes. Also, because the + // old allocation wasn't yet deallocated, it cannot overlap + // `new_ptr`. Thus, the call to `copy_nonoverlapping` is + // safe. The safety contract for `dealloc` must be upheld by the caller. + #[allow(unused_unsafe)] + unsafe { + ptr::copy_nonoverlapping(ptr.as_ptr(), new_ptr.as_mut_ptr(), new_layout.size()); + self.deallocate(ptr, old_layout); + } + + Ok(new_ptr) + } else { + // The new layout should still be mapped to disk. Reuse old pointer. 
+ Ok(NonNull::slice_from_raw_parts( + NonNull::new_unchecked(ptr.as_ptr()), + new_layout.size(), + )) + } + } else { + self.0 + .mem_allocator + .shrink(ptr, old_layout, new_layout) + .map(|p| { + self.0 + .mem_usage + .fetch_sub(old_layout.size() - new_layout.size(), Ordering::Relaxed); + p + }) + } + } +} + +/// A page of memory that is backed by an owned file. +struct Page { + seq: u32, + _f: File, + mmap: MmapMut, + /// The start offset of bytes that are free to use. + tail: usize, + /// Number of active pointers to this page. + ref_counter: usize, +} + +impl Page { + fn new(root: &Path, seq: u32, size: usize) -> Option { + fail::fail_point!("swappy::page::new_failure", |_| None); + if !root.exists() { + // Create directory only when it's needed. + std::fs::create_dir_all(root) + .map_err(|e| error!("Failed to create swap directory: {e}.")) + .ok()?; + } + let path = root.join(Self::page_file_name(seq)); + let f = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(path) + .map_err(|e| error!("Failed to open swap file: {e}")) + .ok()?; + f.set_len(size as u64) + .map_err(|e| error!("Failed to extend swap file: {e}")) + .ok()?; + let mmap = unsafe { + MmapMut::map_mut(&f) + .map_err(|e| error!("Failed to mmap swap file: {e}")) + .ok()? + }; + SWAP_FILE_COUNT.inc(); + Some(Self { + seq, + _f: f, + mmap, + tail: 0, + ref_counter: 0, + }) + } + + #[inline] + fn allocate(&mut self, layout: Layout) -> Option> { + unsafe { + let offset = self + .mmap + .as_ptr() + .add(self.tail) + .align_offset(layout.align()); + if self.tail + offset + layout.size() > self.mmap.len() { + None + } else { + let p = self.mmap.as_ptr().add(self.tail + offset); + self.tail += offset + layout.size(); + self.ref_counter += 1; + Some(NonNull::slice_from_raw_parts( + NonNull::new_unchecked(p as *mut u8), + layout.size(), + )) + } + } + } + + /// Returns whether the page can be retired. + #[inline] + fn deallocate(&mut self, _ptr: NonNull) -> bool { + self.ref_counter -= 1; + self.ref_counter == 0 + } + + /// Deletes this page and cleans up owned resources. + #[inline] + fn release(self, root: &Path) { + debug_assert_eq!(self.ref_counter, 0); + + // Somehow in Windows, we have to drop the mmap file handle first, otherwise + // the following file removal will return "Access Denied (OS Error 5)". + // Not using `#[cfg(windows)]` here is because it might do no harm in other + // operating systems - the mmap file handle is dropped anyhow. + drop(self.mmap); + + let path = root.join(Self::page_file_name(self.seq)); + if let Err(e) = std::fs::remove_file(path) { + warn!("Failed to delete swap file: {e}"); + } + SWAP_FILE_COUNT.dec(); + } + + /// Returns whether the pointer is contained in this page. 
+ #[inline] + fn contains(&self, ptr: NonNull) -> bool { + unsafe { + let start = self.mmap.as_ptr(); + let end = (self.mmap.as_ptr()).add(self.mmap.len()); + let ptr = ptr.as_ptr() as *const u8; + ptr >= start && ptr < end + } + } + + #[inline] + fn page_file_name(seq: u32) -> String { + seq.to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::test_util::catch_unwind_silent; + + #[derive(Default, Clone)] + struct WrappedGlobal { + err_mode: Arc, + alloc: Arc, + dealloc: Arc, + grow: Arc, + shrink: Arc, + } + impl WrappedGlobal { + fn stats(&self) -> (usize, usize, usize, usize) { + ( + self.alloc.load(Ordering::Relaxed), + self.dealloc.load(Ordering::Relaxed), + self.grow.load(Ordering::Relaxed), + self.shrink.load(Ordering::Relaxed), + ) + } + fn set_err_mode(&self, e: bool) { + self.err_mode.store(e, Ordering::Relaxed); + } + } + unsafe impl Allocator for WrappedGlobal { + fn allocate(&self, layout: Layout) -> Result, AllocError> { + self.alloc.fetch_add(1, Ordering::Relaxed); + if self.err_mode.load(Ordering::Relaxed) { + Err(AllocError) + } else { + std::alloc::Global.allocate(layout) + } + } + unsafe fn deallocate(&self, ptr: NonNull, layout: Layout) { + self.dealloc.fetch_add(1, Ordering::Relaxed); + std::alloc::Global.deallocate(ptr, layout) + } + unsafe fn grow( + &self, + ptr: NonNull, + old_layout: Layout, + new_layout: Layout, + ) -> Result, AllocError> { + self.grow.fetch_add(1, Ordering::Relaxed); + if self.err_mode.load(Ordering::Relaxed) { + Err(AllocError) + } else { + std::alloc::Global.grow(ptr, old_layout, new_layout) + } + } + unsafe fn shrink( + &self, + ptr: NonNull, + old_layout: Layout, + new_layout: Layout, + ) -> Result, AllocError> { + self.shrink.fetch_add(1, Ordering::Relaxed); + if self.err_mode.load(Ordering::Relaxed) { + Err(AllocError) + } else { + std::alloc::Global.shrink(ptr, old_layout, new_layout) + } + } + } + + type TestAllocator = SwappyAllocator; + + /// Borrows some memory temporarily. + struct BorrowMemory { + allocator: TestAllocator, + borrowed: usize, + } + + impl Drop for BorrowMemory { + fn drop(&mut self) { + self.allocator + .0 + .mem_usage + .fetch_sub(self.borrowed, Ordering::Relaxed); + } + } + + impl TestAllocator { + fn borrow_memory(&self, size: usize) -> BorrowMemory { + let allocator = SwappyAllocator(self.0.clone()); + allocator.0.mem_usage.fetch_add(size, Ordering::Relaxed); + BorrowMemory { + allocator, + borrowed: size, + } + } + } + + fn file_count(p: &Path) -> usize { + let mut files = 0; + if let Ok(iter) = p.read_dir() { + iter.for_each(|_| files += 1); + } + files + } + + #[test] + fn test_swappy_vec() { + let dir = tempfile::Builder::new() + .prefix("test_swappy_vec") + .tempdir() + .unwrap(); + + let global = WrappedGlobal::default(); + let allocator = TestAllocator::new_over(dir.path(), 1024, global); + let mut vec1: Vec = Vec::new_in(allocator.clone()); + assert_eq!(allocator.memory_usage(), 0); + vec1.resize(1024, 0); + assert_eq!(allocator.memory_usage(), 1024); + // Small vec that uses swap. + let mut vec2: Vec = Vec::new_in(allocator.clone()); + vec2.resize(16, 0); + assert_eq!(allocator.memory_usage(), 1024); + // Grow large vec to swap. + vec1.resize(2048, 0); + assert_eq!(allocator.memory_usage(), 0); + // Shrink large vec to free up memory. + vec1.truncate(4); + vec1.shrink_to_fit(); + assert_eq!(allocator.memory_usage(), 4); + // Grow small vec, should be in memory. 
+ vec2.resize(32, 0); + assert_eq!(allocator.memory_usage(), 4 + 32); + + assert_eq!(file_count(dir.path()), 0); + } + + #[test] + fn test_page_refill() { + let dir = tempfile::Builder::new() + .prefix("test_page_refill") + .tempdir() + .unwrap(); + + let global = WrappedGlobal::default(); + let allocator = TestAllocator::new_over(dir.path(), 0, global); + + let mut vec: Vec = Vec::new_in(allocator.clone()); + assert_eq!(allocator.memory_usage(), 0); + vec.resize(DEFAULT_PAGE_SIZE, 0); + assert_eq!(allocator.memory_usage(), 0); + assert_eq!(file_count(dir.path()), 1); + vec.resize(DEFAULT_PAGE_SIZE * 2, 0); + assert_eq!(allocator.memory_usage(), 0); + assert_eq!(file_count(dir.path()), 1); + vec.resize(DEFAULT_PAGE_SIZE * 3, 0); + assert_eq!(allocator.memory_usage(), 0); + assert_eq!(file_count(dir.path()), 1); + vec.clear(); + vec.shrink_to_fit(); + assert_eq!(allocator.memory_usage(), 0); + assert_eq!(file_count(dir.path()), 0); + } + + #[test] + fn test_empty_pointer() { + let dir = tempfile::Builder::new() + .prefix("test_empty_pointer") + .tempdir() + .unwrap(); + let global = WrappedGlobal::default(); + let allocator = TestAllocator::new_over(dir.path(), 4, global.clone()); + let empty_p_1 = allocator + .allocate(Layout::from_size_align(0, 1).unwrap()) + .unwrap(); + assert_eq!(allocator.memory_usage(), 0); + assert_eq!(file_count(dir.path()), 0); + assert_eq!(global.stats(), (1, 0, 0, 0)); + + let borrow = allocator.borrow_memory(8); + + let empty_p_2 = allocator + .allocate(Layout::from_size_align(0, 1).unwrap()) + .unwrap(); + assert_eq!(allocator.memory_usage(), 8); + assert_eq!(file_count(dir.path()), 0); + assert_eq!(global.stats(), (2, 0, 0, 0)); + unsafe { + allocator.deallocate( + NonNull::new_unchecked(empty_p_1.as_ptr().as_mut_ptr()), + Layout::from_size_align(0, 1).unwrap(), + ); + assert_eq!(allocator.memory_usage(), 8); + assert_eq!(global.stats(), (2, 1, 0, 0)); + std::mem::drop(borrow); + assert_eq!(allocator.memory_usage(), 0); + allocator.deallocate( + NonNull::new_unchecked(empty_p_2.as_ptr().as_mut_ptr()), + Layout::from_size_align(0, 1).unwrap(), + ); + assert_eq!(allocator.memory_usage(), 0); + assert_eq!(global.stats(), (2, 2, 0, 0)); + } + } + + #[test] + fn test_shrink_reuse() { + let dir = tempfile::Builder::new() + .prefix("test_shrink_reuse") + .tempdir() + .unwrap(); + let global = WrappedGlobal::default(); + let allocator = TestAllocator::new_over(dir.path(), 16, global); + let mut vec: Vec = Vec::new_in(allocator.clone()); + vec.resize(DEFAULT_PAGE_SIZE, 0); + assert_eq!(file_count(dir.path()), 1); + vec.resize(DEFAULT_PAGE_SIZE / 2, 0); + // Didn't allocate new page. + assert_eq!(file_count(dir.path()), 1); + // Switch to memory. + vec.resize(4, 0); + vec.shrink_to_fit(); + assert_eq!(allocator.memory_usage(), 4); + assert_eq!(file_count(dir.path()), 0); + vec.clear(); + vec.shrink_to_fit(); + assert_eq!(allocator.memory_usage(), 0); + assert_eq!(file_count(dir.path()), 0); + } + + // Test that some grows are not routed to the underlying allocator. 
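The three tests above all pivot on the same bookkeeping: `memory_usage()` counts only bytes served by the in-memory allocator, each allocation is charged optimistically and the charge is rolled back when it spills to a swap page, and borrowed usage can push later requests over the budget. A self-contained model of that accounting follows, with plain integers in place of the crate's atomics and made-up sizes. The `// Test that some grows are not routed ...` note above introduces the regression test that resumes right after this sketch.

```rust
// Toy model of the SwappyAllocator budget accounting exercised by the tests above.
struct Budget {
    budget: usize,
    mem_usage: usize, // only bytes served by the in-memory allocator
}

enum Placement {
    Memory,
    Swap,
}

impl Budget {
    fn allocate(&mut self, size: usize) -> Placement {
        self.mem_usage += size; // optimistic charge, like the fetch_add above
        if size > 0 && self.mem_usage > self.budget {
            self.mem_usage -= size; // swapped bytes are never charged
            Placement::Swap
        } else {
            Placement::Memory
        }
    }

    fn deallocate_memory(&mut self, size: usize) {
        self.mem_usage -= size;
    }
}

fn main() {
    let mut b = Budget { budget: 1024, mem_usage: 0 };
    assert!(matches!(b.allocate(1024), Placement::Memory)); // exactly at budget
    assert!(matches!(b.allocate(16), Placement::Swap)); // over budget, spills to disk
    assert_eq!(b.mem_usage, 1024); // spilled bytes are not counted
    b.deallocate_memory(1024);
    assert!(matches!(b.allocate(16), Placement::Memory)); // room again
}
```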
+ #[test] + fn test_grow_regression() { + let dir = tempfile::Builder::new() + .prefix("test_grow_regression") + .tempdir() + .unwrap(); + let global = WrappedGlobal::default(); + let allocator = TestAllocator::new_over(dir.path(), 100, global.clone()); + let mut disk_vec: Vec = Vec::new_in(allocator.clone()); + assert_eq!(allocator.memory_usage(), 0); + disk_vec.resize(400, 0); + assert_eq!(global.stats(), (0, 0, 0, 0)); + assert_eq!(allocator.memory_usage(), 0); + assert_eq!(global.stats(), (0, 0, 0, 0)); + let mut mem_vec: Vec = Vec::with_capacity_in(8, allocator.clone()); + assert_eq!(allocator.memory_usage(), 8); + assert_eq!(global.stats(), (1, 0, 0, 0)); + // Grow calls . + mem_vec.resize(16, 0); + assert_eq!(allocator.memory_usage(), 16); + assert_eq!(global.stats(), (2, 1, 0, 0)); + // Deallocate all pages, calls when memory use is low. + disk_vec.truncate(1); + disk_vec.shrink_to_fit(); + assert_eq!(allocator.memory_usage(), 16 + 1); + assert_eq!(global.stats(), (3, 1, 0, 0)); + assert_eq!(file_count(dir.path()), 0); + // Grow calls now. + mem_vec.resize(32, 0); + assert_eq!(allocator.memory_usage(), 32 + 1); + assert_eq!(global.stats(), (3, 1, 1, 0)); + } + + // alloc_error_hook doesn't work well with asan. + #[cfg(not(sanitize = "address"))] + #[test] + fn test_mem_allocator_failure() { + let dir = tempfile::Builder::new() + .prefix("test_mem_allocator_failure") + .tempdir() + .unwrap(); + let global = WrappedGlobal::default(); + let allocator = TestAllocator::new_over(dir.path(), usize::MAX, global.clone()); + // Replace std hook because it will abort. + let std_hook = std::alloc::take_alloc_error_hook(); + std::alloc::set_alloc_error_hook(|_| { + panic!("oom"); + }); + // allocate failure + { + let mut vec: Vec = Vec::new_in(allocator.clone()); + global.set_err_mode(true); + assert!(catch_unwind_silent(|| { + vec.resize(16, 0); + }) + .is_err()); + assert_eq!(allocator.memory_usage(), 0); + global.set_err_mode(false); + vec.resize(16, 0); + assert_eq!(allocator.memory_usage(), 16); + } + // grow failure + { + let mut vec: Vec = Vec::new_in(allocator.clone()); + vec.resize(16, 0); + assert_eq!(allocator.memory_usage(), 16); + global.set_err_mode(true); + assert!(catch_unwind_silent(|| { + vec.resize(32, 0); + }) + .is_err()); + assert_eq!(allocator.memory_usage(), 16); + global.set_err_mode(false); + vec.resize(32, 0); + assert_eq!(allocator.memory_usage(), 32); + } + // shrink failure + { + let mut vec: Vec = Vec::new_in(allocator.clone()); + vec.resize(32, 0); + assert_eq!(allocator.memory_usage(), 32); + global.set_err_mode(true); + vec.resize(16, 0); + assert!(catch_unwind_silent(|| { + vec.shrink_to_fit(); + }) + .is_err()); + assert_eq!(allocator.memory_usage(), 32); + global.set_err_mode(false); + vec.shrink_to_fit(); + assert_eq!(allocator.memory_usage(), 16); + } + std::alloc::set_alloc_error_hook(std_hook); + } + + // Mashup of tests from std::alloc. 
+ #[test] + fn test_swappy_vec_deque() { + type VecDeque = std::collections::VecDeque; + + fn collect(iter: T, a: TestAllocator) -> VecDeque + where + T: std::iter::IntoIterator, + { + let mut vec = VecDeque::new_in(a); + for i in iter { + vec.push_back(i); + } + vec + } + + fn collect_v(iter: T, a: TestAllocator) -> Vec + where + T: std::iter::IntoIterator, + { + let mut vec = Vec::new_in(a); + for i in iter { + vec.push(i); + } + vec + } + + let dir = tempfile::Builder::new() + .prefix("test_swappy_vec_deque") + .tempdir() + .unwrap(); + let global = WrappedGlobal::default(); + let allocator = TestAllocator::new_over(dir.path(), 0, global); + + { + // test_with_capacity_non_power_two + let mut d3 = VecDeque::with_capacity_in(3, allocator.clone()); + d3.push_back(1); + + // X = None, | = lo + // [|1, X, X] + assert_eq!(d3.pop_front(), Some(1)); + // [X, |X, X] + assert_eq!(d3.front(), None); + + // [X, |3, X] + d3.push_back(3); + // [X, |3, 6] + d3.push_back(6); + // [X, X, |6] + assert_eq!(d3.pop_front(), Some(3)); + + // Pushing the lo past half way point to trigger + // the 'B' scenario for growth + // [9, X, |6] + d3.push_back(9); + // [9, 12, |6] + d3.push_back(12); + + d3.push_back(15); + // There used to be a bug here about how the + // VecDeque made growth assumptions about the + // underlying Vec which didn't hold and lead + // to corruption. + // (Vec grows to next power of two) + // good- [9, 12, 15, X, X, X, X, |6] + // bug- [15, 12, X, X, X, |6, X, X] + assert_eq!(d3.pop_front(), Some(6)); + + // Which leads us to the following state which + // would be a failure case. + // bug- [15, 12, X, X, X, X, |X, X] + assert_eq!(d3.front(), Some(&9)); + } + { + // test_into_iter + // Empty iter + { + let d: VecDeque = VecDeque::new_in(allocator.clone()); + let mut iter = d.into_iter(); + + assert_eq!(iter.size_hint(), (0, Some(0))); + assert_eq!(iter.next(), None); + assert_eq!(iter.size_hint(), (0, Some(0))); + } + + // simple iter + { + let mut d = VecDeque::new_in(allocator.clone()); + for i in 0..5 { + d.push_back(i); + } + + let b = vec![0, 1, 2, 3, 4]; + assert_eq!(d.into_iter().collect::>(), b); + } + + // wrapped iter + { + let mut d = VecDeque::new_in(allocator.clone()); + for i in 0..5 { + d.push_back(i); + } + for i in 6..9 { + d.push_front(i); + } + + let b = vec![8, 7, 6, 0, 1, 2, 3, 4]; + assert_eq!(d.into_iter().collect::>(), b); + } + + // partially used + { + let mut d = VecDeque::new_in(allocator.clone()); + for i in 0..5 { + d.push_back(i); + } + for i in 6..9 { + d.push_front(i); + } + + let mut it = d.into_iter(); + assert_eq!(it.size_hint(), (8, Some(8))); + assert_eq!(it.next(), Some(8)); + assert_eq!(it.size_hint(), (7, Some(7))); + assert_eq!(it.next_back(), Some(4)); + assert_eq!(it.size_hint(), (6, Some(6))); + assert_eq!(it.next(), Some(7)); + assert_eq!(it.size_hint(), (5, Some(5))); + } + } + { + // test_eq_after_rotation + // test that two deques are equal even if elements are laid out differently + let len = 28; + let mut ring: VecDeque = collect(0..len, allocator.clone()); + let mut shifted = ring.clone(); + for _ in 0..10 { + // shift values 1 step to the right by pop, sub one, push + ring.pop_front(); + for elt in &mut ring { + *elt -= 1; + } + ring.push_back(len - 1); + } + + // try every shift + for _ in 0..shifted.capacity() { + shifted.pop_front(); + for elt in &mut shifted { + *elt -= 1; + } + shifted.push_back(len - 1); + assert_eq!(shifted, ring); + assert_eq!(ring, shifted); + } + } + { + // test_drop_with_pop + static mut DROPS: i32 = 0; + 
struct Elem; + impl Drop for Elem { + fn drop(&mut self) { + unsafe { + DROPS += 1; + } + } + } + + let mut ring = VecDeque::new_in(allocator.clone()); + ring.push_back(Elem); + ring.push_front(Elem); + ring.push_back(Elem); + ring.push_front(Elem); + + drop(ring.pop_back()); + drop(ring.pop_front()); + assert_eq!(unsafe { DROPS }, 2); + + drop(ring); + assert_eq!(unsafe { DROPS }, 4); + } + { + // test_reserve_grow + // test growth path A + // [T o o H] -> [T o o H . . . . ] + let mut ring = VecDeque::with_capacity_in(4, allocator.clone()); + for i in 0..3 { + ring.push_back(i); + } + ring.reserve(7); + for i in 0..3 { + assert_eq!(ring.pop_front(), Some(i)); + } + + // test growth path B + // [H T o o] -> [. T o o H . . . ] + let mut ring = VecDeque::with_capacity_in(4, allocator.clone()); + for i in 0..1 { + ring.push_back(i); + assert_eq!(ring.pop_front(), Some(i)); + } + for i in 0..3 { + ring.push_back(i); + } + ring.reserve(7); + for i in 0..3 { + assert_eq!(ring.pop_front(), Some(i)); + } + + // test growth path C + // [o o H T] -> [o o H . . . . T ] + let mut ring = VecDeque::with_capacity_in(4, allocator.clone()); + for i in 0..3 { + ring.push_back(i); + assert_eq!(ring.pop_front(), Some(i)); + } + for i in 0..3 { + ring.push_back(i); + } + ring.reserve(7); + for i in 0..3 { + assert_eq!(ring.pop_front(), Some(i)); + } + } + { + // test_append_permutations + fn construct_vec_deque( + push_back: usize, + pop_back: usize, + push_front: usize, + pop_front: usize, + allocator: TestAllocator, + ) -> VecDeque { + let mut out = VecDeque::new_in(allocator); + for a in 0..push_back { + out.push_back(a); + } + for b in 0..push_front { + out.push_front(push_back + b); + } + for _ in 0..pop_back { + out.pop_back(); + } + for _ in 0..pop_front { + out.pop_front(); + } + out + } + + // Miri is too slow + let max = if cfg!(miri) { 3 } else { 5 }; + + // Many different permutations of both the `VecDeque` getting appended to + // and the one getting appended are generated to check `append`. + // This ensures all 6 code paths of `append` are tested. + for src_push_back in 0..max { + for src_push_front in 0..max { + // doesn't pop more values than are pushed + for src_pop_back in 0..(src_push_back + src_push_front) { + for src_pop_front in 0..(src_push_back + src_push_front - src_pop_back) { + let src = construct_vec_deque( + src_push_back, + src_pop_back, + src_push_front, + src_pop_front, + allocator.clone(), + ); + + for dst_push_back in 0..max { + for dst_push_front in 0..max { + for dst_pop_back in 0..(dst_push_back + dst_push_front) { + for dst_pop_front in + 0..(dst_push_back + dst_push_front - dst_pop_back) + { + let mut dst = construct_vec_deque( + dst_push_back, + dst_pop_back, + dst_push_front, + dst_pop_front, + allocator.clone(), + ); + let mut src = src.clone(); + + // Assert that appending `src` to `dst` gives the same + // order + // of values as iterating over both in sequence. 
+ let correct = collect_v( + dst.iter().chain(src.iter()).cloned(), + allocator.clone(), + ); + dst.append(&mut src); + assert_eq!(dst, correct); + assert!(src.is_empty()); + } + } + } + } + } + } + } + } + } + { + // test_append_double_drop + struct DropCounter<'a> { + count: &'a mut u32, + } + + impl Drop for DropCounter<'_> { + fn drop(&mut self) { + *self.count += 1; + } + } + let (mut count_a, mut count_b) = (0, 0); + { + let mut a = VecDeque::new_in(allocator.clone()); + let mut b = VecDeque::new_in(allocator.clone()); + a.push_back(DropCounter { + count: &mut count_a, + }); + b.push_back(DropCounter { + count: &mut count_b, + }); + + a.append(&mut b); + } + assert_eq!(count_a, 1); + assert_eq!(count_b, 1); + } + { + // test_extend_ref + let mut v = VecDeque::new_in(allocator.clone()); + v.push_back(1); + v.extend(&[2, 3, 4]); + + assert_eq!(v.len(), 4); + assert_eq!(v[0], 1); + assert_eq!(v[1], 2); + assert_eq!(v[2], 3); + assert_eq!(v[3], 4); + + let mut w = VecDeque::new_in(allocator.clone()); + w.push_back(5); + w.push_back(6); + v.extend(&w); + + assert_eq!(v.len(), 6); + assert_eq!(v[0], 1); + assert_eq!(v[1], 2); + assert_eq!(v[2], 3); + assert_eq!(v[3], 4); + assert_eq!(v[4], 5); + assert_eq!(v[5], 6); + } + { + // test_rotate_left_random + let shifts = [ + 6, 1, 0, 11, 12, 1, 11, 7, 9, 3, 6, 1, 4, 0, 5, 1, 3, 1, 12, 8, 3, 1, 11, 11, 9, 4, + 12, 3, 12, 9, 11, 1, 7, 9, 7, 2, + ]; + let n = 12; + let mut v: VecDeque<_> = collect(0..n, allocator.clone()); + let mut total_shift = 0; + for shift in shifts.iter().cloned() { + v.rotate_left(shift); + total_shift += shift; + #[allow(clippy::needless_range_loop)] + for i in 0..n { + assert_eq!(v[i], (i + total_shift) % n); + } + } + } + { + // test_drain_leak + static mut DROPS: i32 = 0; + + #[derive(Debug, PartialEq, Eq)] + struct D(u32, bool); + + impl Drop for D { + fn drop(&mut self) { + unsafe { + DROPS += 1; + } + + if self.1 { + panic!("panic in `drop`"); + } + } + } + + let mut v = VecDeque::new_in(allocator.clone()); + v.push_back(D(4, false)); + v.push_back(D(5, false)); + v.push_back(D(6, false)); + v.push_front(D(3, false)); + v.push_front(D(2, true)); + v.push_front(D(1, false)); + v.push_front(D(0, false)); + + assert!(catch_unwind_silent(|| { + v.drain(1..=4); + }) + .is_err()); + + assert_eq!(unsafe { DROPS }, 4); + assert_eq!(v.len(), 3); + drop(v); + assert_eq!(unsafe { DROPS }, 7); + } + { + // test_zero_sized_push + const N: usize = 8; + + // Zero sized type + struct Zst; + + // Test that for all possible sequences of push_front / push_back, + // we end up with a deque of the correct size + + for len in 0..N { + let mut tester = VecDeque::with_capacity_in(len, allocator.clone()); + assert_eq!(tester.len(), 0); + assert!(tester.capacity() >= len); + for case in 0..(1 << len) { + assert_eq!(tester.len(), 0); + for bit in 0..len { + if case & (1 << bit) != 0 { + tester.push_front(Zst); + } else { + tester.push_back(Zst); + } + } + assert_eq!(tester.len(), len); + #[allow(clippy::iter_count)] + let iter_len = tester.iter().count(); + assert_eq!(iter_len, len); + tester.clear(); + } + } + } + { + // issue-58952 + let c = 2; + let bv = [2]; + let b = bv.iter().filter(|a| **a == c); + + let _a = collect( + vec![1, 2, 3] + .into_iter() + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b 
== *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)) + .filter(|a| b.clone().any(|b| *b == *a)), + allocator.clone(), + ); + } + { + // issue-54477 + let mut vecdeque_13 = collect(vec![], allocator.clone()); + let mut vecdeque_29 = collect(vec![0], allocator.clone()); + vecdeque_29.insert(0, 30); + vecdeque_29.insert(1, 31); + vecdeque_29.insert(2, 32); + vecdeque_29.insert(3, 33); + vecdeque_29.insert(4, 34); + vecdeque_29.insert(5, 35); + + vecdeque_13.append(&mut vecdeque_29); + + assert_eq!( + vecdeque_13, + collect(vec![30, 31, 32, 33, 34, 35, 0], allocator,) + ); + } + + assert_eq!(file_count(dir.path()), 0); + } + + fn bench_allocator(a: &A) { + unsafe { + let ptr = a + .allocate_zeroed(Layout::from_size_align(8, 8).unwrap()) + .unwrap(); + let ptr = a + .grow_zeroed( + NonNull::new_unchecked(ptr.as_ptr().as_mut_ptr()), + Layout::from_size_align(8, 8).unwrap(), + Layout::from_size_align(16, 8).unwrap(), + ) + .unwrap(); + let ptr = a + .grow_zeroed( + NonNull::new_unchecked(ptr.as_ptr().as_mut_ptr()), + Layout::from_size_align(16, 8).unwrap(), + Layout::from_size_align(32, 8).unwrap(), + ) + .unwrap(); + let ptr = a + .grow_zeroed( + NonNull::new_unchecked(ptr.as_ptr().as_mut_ptr()), + Layout::from_size_align(32, 8).unwrap(), + Layout::from_size_align(64, 8).unwrap(), + ) + .unwrap(); + let ptr = a + .grow_zeroed( + NonNull::new_unchecked(ptr.as_ptr().as_mut_ptr()), + Layout::from_size_align(64, 8).unwrap(), + Layout::from_size_align(128, 8).unwrap(), + ) + .unwrap(); + let ptr = a + .shrink( + NonNull::new_unchecked(ptr.as_ptr().as_mut_ptr()), + Layout::from_size_align(128, 8).unwrap(), + Layout::from_size_align(8, 8).unwrap(), + ) + .unwrap(); + a.deallocate( + NonNull::new_unchecked(ptr.as_ptr().as_mut_ptr()), + Layout::from_size_align(8, 8).unwrap(), + ); + } + } + + #[bench] + fn bench_allocator_std_global(b: &mut test::Bencher) { + b.iter(move || { + bench_allocator(&std::alloc::Global); + }); + } + + #[bench] + fn bench_allocator_fast_path(b: &mut test::Bencher) { + let dir = tempfile::Builder::new() + .prefix("bench_allocator_fast_path") + .tempdir() + .unwrap(); + let allocator = SwappyAllocator::new(dir.path(), usize::MAX); + b.iter(move || { + bench_allocator(&allocator); + }); + } + + #[bench] + fn bench_allocator_slow_path(b: &mut test::Bencher) { + let dir = tempfile::Builder::new() + .prefix("bench_allocator_slow_path") + .tempdir() + .unwrap(); + let allocator = SwappyAllocator::new(dir.path(), 0); + b.iter(move || { + bench_allocator(&allocator); + }); + } +} diff --git a/third/raft-engine/src/test_util.rs b/third/raft-engine/src/test_util.rs new file mode 100644 index 00000000..ff10af05 --- /dev/null +++ b/third/raft-engine/src/test_util.rs @@ -0,0 +1,96 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
+ +use std::panic::{self, AssertUnwindSafe}; + +use raft::eraftpb::Entry; + +use crate::{ + memtable::EntryIndex, + pipe_log::{FileBlockHandle, FileId}, +}; + +pub fn generate_entries(begin_index: u64, end_index: u64, data: Option<&[u8]>) -> Vec { + let mut v = vec![Entry::default(); (end_index - begin_index) as usize]; + let mut index = begin_index; + for e in v.iter_mut() { + e.set_index(index); + if let Some(data) = data { + e.set_data(data.to_vec().into()) + } + index += 1; + } + v +} + +pub fn generate_entry_indexes(begin_idx: u64, end_idx: u64, file_id: FileId) -> Vec { + generate_entry_indexes_opt(begin_idx, end_idx, Some(file_id)) +} + +pub fn generate_entry_indexes_opt( + begin_idx: u64, + end_idx: u64, + file_id: Option, +) -> Vec { + assert!(end_idx >= begin_idx); + let mut ents_idx = vec![]; + for idx in begin_idx..end_idx { + let ent_idx = EntryIndex { + index: idx, + entries: file_id.map(|id| FileBlockHandle { + id, + offset: 0, + len: 0, + }), + entry_len: 1, + ..Default::default() + }; + + ents_idx.push(ent_idx); + } + ents_idx +} + +/// Catch panic while suppressing default panic hook. +pub fn catch_unwind_silent(f: F) -> std::thread::Result +where + F: FnOnce() -> R, +{ + let prev_hook = panic::take_hook(); + panic::set_hook(Box::new(|_| {})); + let result = panic::catch_unwind(AssertUnwindSafe(f)); + panic::set_hook(prev_hook); + result +} + +pub struct PanicGuard { + prev_hook: *mut (dyn Fn(&panic::PanicInfo<'_>) + Sync + Send + 'static), +} + +struct PointerHolder(*mut T); + +unsafe impl Send for PointerHolder {} +unsafe impl Sync for PointerHolder {} + +impl PanicGuard { + pub fn with_prompt(s: String) -> Self { + let prev_hook = Box::into_raw(panic::take_hook()); + let sendable_prev_hook = PointerHolder(prev_hook); + // FIXME: Use thread local hook. + panic::set_hook(Box::new(move |info| { + eprintln!("{s}"); + unsafe { (*sendable_prev_hook.0)(info) }; + })); + PanicGuard { prev_hook } + } +} + +impl Drop for PanicGuard { + fn drop(&mut self) { + if !std::thread::panicking() { + let _ = panic::take_hook(); + unsafe { + panic::set_hook(Box::from_raw(self.prev_hook)); + } + } + } +} diff --git a/third/raft-engine/src/util.rs b/third/raft-engine/src/util.rs new file mode 100644 index 00000000..7e1d09c0 --- /dev/null +++ b/third/raft-engine/src/util.rs @@ -0,0 +1,472 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
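Before `util.rs` continues below, a quick usage note on the test helper just added: `catch_unwind_silent` composes with ordinary assertions as shown here. The helper body is copied from the file above so the snippet runs on its own; the closures are illustrative.

```rust
use std::panic::{self, AssertUnwindSafe};

// Same behavior as the helper above: suppress the default panic hook while
// catching the panic, then restore it.
fn catch_unwind_silent<F: FnOnce() -> R, R>(f: F) -> std::thread::Result<R> {
    let prev_hook = panic::take_hook();
    panic::set_hook(Box::new(|_| {}));
    let result = panic::catch_unwind(AssertUnwindSafe(f));
    panic::set_hook(prev_hook);
    result
}

fn main() {
    // A panicking closure is reported as Err without spamming stderr.
    assert!(catch_unwind_silent(|| panic!("boom")).is_err());
    // A normal closure passes its value through.
    assert_eq!(catch_unwind_silent(|| 2 + 2).unwrap(), 4);
}
```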
+
+use std::fmt::{self, Display, Write};
+use std::ops::{Div, Mul};
+use std::str::FromStr;
+use std::time::{Duration, Instant};
+
+use crc32fast::Hasher;
+use serde::de::{self, Unexpected, Visitor};
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+
+const UNIT: u64 = 1;
+
+const BINARY_DATA_MAGNITUDE: u64 = 1024;
+pub const B: u64 = UNIT;
+pub const KIB: u64 = B * BINARY_DATA_MAGNITUDE;
+pub const MIB: u64 = KIB * BINARY_DATA_MAGNITUDE;
+pub const GIB: u64 = MIB * BINARY_DATA_MAGNITUDE;
+pub const TIB: u64 = GIB * BINARY_DATA_MAGNITUDE;
+pub const PIB: u64 = TIB * BINARY_DATA_MAGNITUDE;
+
+#[derive(Clone, Debug, Copy, PartialEq, Eq, PartialOrd)]
+pub struct ReadableSize(pub u64);
+
+impl ReadableSize {
+    pub const fn kb(count: u64) -> ReadableSize {
+        ReadableSize(count * KIB)
+    }
+
+    pub const fn mb(count: u64) -> ReadableSize {
+        ReadableSize(count * MIB)
+    }
+
+    pub const fn gb(count: u64) -> ReadableSize {
+        ReadableSize(count * GIB)
+    }
+
+    pub const fn as_mb(self) -> u64 {
+        self.0 / MIB
+    }
+}
+
+impl Div<u64> for ReadableSize {
+    type Output = ReadableSize;
+
+    fn div(self, rhs: u64) -> ReadableSize {
+        ReadableSize(self.0 / rhs)
+    }
+}
+
+impl Div<ReadableSize> for ReadableSize {
+    type Output = u64;
+
+    fn div(self, rhs: ReadableSize) -> u64 {
+        self.0 / rhs.0
+    }
+}
+
+impl Mul<u64> for ReadableSize {
+    type Output = ReadableSize;
+
+    fn mul(self, rhs: u64) -> ReadableSize {
+        ReadableSize(self.0 * rhs)
+    }
+}
+
+impl Serialize for ReadableSize {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: Serializer,
+    {
+        let mut buffer = String::new();
+        write!(buffer, "{self}").unwrap();
+        serializer.serialize_str(&buffer)
+    }
+}
+
+impl FromStr for ReadableSize {
+    type Err = String;
+
+    // This method parses value in binary unit.
+    fn from_str(s: &str) -> Result<ReadableSize, String> {
+        let size_str = s.trim();
+        if size_str.is_empty() {
+            return Err(format!("{s:?} is not a valid size."));
+        }
+
+        if !size_str.is_ascii() {
+            return Err(format!("ASCII string is expected, but got {s:?}"));
+        }
+
+        // size: digits and '.'
as decimal separator + let size_len = size_str + .to_string() + .chars() + .take_while(|c| char::is_ascii_digit(c) || ['.', 'e', 'E', '-', '+'].contains(c)) + .count(); + + // unit: alphabetic characters + let (size, unit) = size_str.split_at(size_len); + + let unit = match unit.trim() { + "K" | "KB" | "KiB" => KIB, + "M" | "MB" | "MiB" => MIB, + "G" | "GB" | "GiB" => GIB, + "T" | "TB" | "TiB" => TIB, + "P" | "PB" | "PiB" => PIB, + "B" | "" => B, + _ => { + return Err(format!( + "only B, KB, KiB, MB, MiB, GB, GiB, TB, TiB, PB, and PiB are supported: {s:?}" + )); + } + }; + + match size.parse::() { + Ok(n) => Ok(ReadableSize((n * unit as f64) as u64)), + Err(_) => Err(format!("invalid size string: {s:?}")), + } + } +} + +impl Display for ReadableSize { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let size = self.0; + if size == 0 { + write!(f, "{size}KiB") + } else if size % PIB == 0 { + write!(f, "{}PiB", size / PIB) + } else if size % TIB == 0 { + write!(f, "{}TiB", size / TIB) + } else if size % GIB == 0 { + write!(f, "{}GiB", size / GIB) + } else if size % MIB == 0 { + write!(f, "{}MiB", size / MIB) + } else if size % KIB == 0 { + write!(f, "{}KiB", size / KIB) + } else { + write!(f, "{size}B") + } + } +} + +impl<'de> Deserialize<'de> for ReadableSize { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + struct SizeVisitor; + + impl<'de> Visitor<'de> for SizeVisitor { + type Value = ReadableSize; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("valid size") + } + + fn visit_i64(self, size: i64) -> Result + where + E: de::Error, + { + if size >= 0 { + self.visit_u64(size as u64) + } else { + Err(E::invalid_value(Unexpected::Signed(size), &self)) + } + } + + fn visit_u64(self, size: u64) -> Result + where + E: de::Error, + { + Ok(ReadableSize(size)) + } + + fn visit_str(self, size_str: &str) -> Result + where + E: de::Error, + { + size_str.parse().map_err(E::custom) + } + } + + deserializer.deserialize_any(SizeVisitor) + } +} + +pub trait InstantExt { + fn saturating_elapsed(&self) -> Duration; +} + +impl InstantExt for Instant { + #[inline] + fn saturating_elapsed(&self) -> Duration { + Instant::now().saturating_duration_since(*self) + } +} + +#[inline] +pub fn crc32(data: &[u8]) -> u32 { + let mut hasher = Hasher::new(); + hasher.update(data); + hasher.finalize() +} + +// Credit: [splitmix64 algorithm](https://xorshift.di.unimi.it/splitmix64.c) +#[inline] +pub fn hash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 30)).wrapping_mul(0xbf58476d1ce4e5b9); + i = (i ^ (i >> 27)).wrapping_mul(0x94d049bb133111eb); + i ^ (i >> 31) +} + +#[allow(dead_code)] +#[inline] +pub fn unhash_u64(mut i: u64) -> u64 { + i = (i ^ (i >> 31) ^ (i >> 62)).wrapping_mul(0x319642b2d24d8ec3); + i = (i ^ (i >> 27) ^ (i >> 54)).wrapping_mul(0x96de1b173f119089); + i ^ (i >> 30) ^ (i >> 60) +} + +pub mod lz4 { + use crate::{Error, Result}; + use std::{i32, ptr}; + + pub const DEFAULT_LZ4_COMPRESSION_LEVEL: usize = 1; + + /// Compress content in `buf[skip..]`, and append output to `buf`. 
+ pub fn append_compress_block(buf: &mut Vec, skip: usize, level: usize) -> Result { + let buf_len = buf.len(); + let content_len = buf_len - skip; + let mut compression_ratio = 0.0; + if content_len > 0 { + if content_len > i32::MAX as usize { + return Err(Error::InvalidArgument(format!( + "Content too long {content_len}" + ))); + } + unsafe { + let bound = lz4_sys::LZ4_compressBound(content_len as i32); + debug_assert!(bound > 0); + + // Layout: { decoded_len | content } + buf.reserve(buf_len + 4 + bound as usize); + let buf_ptr = buf.as_mut_ptr(); + + let le_len = content_len.to_le_bytes(); + ptr::copy_nonoverlapping(le_len.as_ptr(), buf_ptr.add(buf_len), 4); + + let compressed = lz4_sys::LZ4_compress_fast( + buf_ptr.add(skip) as _, + buf_ptr.add(buf_len + 4) as _, + content_len as i32, + bound, + level as i32, + ); + if compressed == 0 { + return Err(Error::Other(box_err!("Compression failed"))); + } + compression_ratio = compressed as f64 / content_len as f64; + buf.set_len(buf_len + 4 + compressed as usize); + } + } + Ok(compression_ratio) + } + + pub fn decompress_block(src: &[u8]) -> Result> { + if src.len() > 4 { + unsafe { + let len = u32::from_le(ptr::read_unaligned(src.as_ptr() as *const u32)); + let mut dst = Vec::with_capacity(len as usize); + let l = lz4_sys::LZ4_decompress_safe( + src.as_ptr().add(4) as _, + dst.as_mut_ptr() as _, + src.len() as i32 - 4, + dst.capacity() as i32, + ); + if l == len as i32 { + dst.set_len(l as usize); + Ok(dst) + } else if l < 0 { + Err(Error::Other(box_err!("Decompression failed {l}"))) + } else { + Err(Error::Corruption(format!( + "Decompressed content length mismatch {l} != {len}" + ))) + } + } + } else if !src.is_empty() { + Err(Error::Corruption(format!( + "Content to compress too short {}", + src.len() + ))) + } else { + Ok(Vec::new()) + } + } + + #[cfg(test)] + mod tests { + #[test] + fn test_basic() { + let vecs: Vec> = vec![b"".to_vec(), b"123".to_vec(), b"12345678910".to_vec()]; + for mut vec in vecs.into_iter() { + let uncompressed_len = vec.len(); + let compression_ratio = + super::append_compress_block(&mut vec, 0, super::DEFAULT_LZ4_COMPRESSION_LEVEL) + .unwrap(); + if uncompressed_len == 0 { + assert_eq!(compression_ratio, 0.0); + } + let res = super::decompress_block(&vec[uncompressed_len..]).unwrap(); + assert_eq!(res, vec[..uncompressed_len].to_owned()); + } + } + } +} + +pub trait Factory: Send + Sync { + fn new_target(&self) -> Target; +} + +/// Returns an aligned `offset`. +/// +/// # Example: +/// +/// ```ignore +/// assert_eq!(round_up(18, 4), 20); +/// assert_eq!(round_up(64, 16), 64); +/// ``` +#[inline] +pub fn round_up(offset: usize, alignment: usize) -> usize { + (offset + alignment - 1) / alignment * alignment +} + +/// Returns an aligned `offset`. 
+/// +/// # Example: +/// +/// ```ignore +/// assert_eq!(round_down(18, 4), 16); +/// assert_eq!(round_down(64, 16), 64); +/// ``` +#[allow(dead_code)] +#[inline] +pub fn round_down(offset: usize, alignment: usize) -> usize { + offset / alignment * alignment +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_readable_size() { + let s = ReadableSize::kb(2); + assert_eq!(s.0, 2048); + assert_eq!(s.as_mb(), 0); + let s = ReadableSize::mb(2); + assert_eq!(s.0, 2 * 1024 * 1024); + assert_eq!(s.as_mb(), 2); + let s = ReadableSize::gb(2); + assert_eq!(s.0, 2 * 1024 * 1024 * 1024); + assert_eq!(s.as_mb(), 2048); + + assert_eq!((ReadableSize::mb(2) / 2).0, MIB); + assert_eq!((ReadableSize::mb(1) / 2).0, 512 * KIB); + assert_eq!(ReadableSize::mb(2) / ReadableSize::kb(1), 2048); + } + + #[test] + fn test_parse_readable_size() { + #[derive(Serialize, Deserialize)] + struct SizeHolder { + s: ReadableSize, + } + + let legal_cases = vec![ + (0, "0KiB"), + (2 * KIB, "2KiB"), + (4 * MIB, "4MiB"), + (5 * GIB, "5GiB"), + (7 * TIB, "7TiB"), + (11 * PIB, "11PiB"), + ]; + for (size, exp) in legal_cases { + let c = SizeHolder { + s: ReadableSize(size), + }; + let res_str = toml::to_string(&c).unwrap(); + let exp_str = format!("s = {exp:?}\n"); + assert_eq!(res_str, exp_str); + let res_size: SizeHolder = toml::from_str(&exp_str).unwrap(); + assert_eq!(res_size.s.0, size); + } + + let c = SizeHolder { + s: ReadableSize(512), + }; + let res_str = toml::to_string(&c).unwrap(); + assert_eq!(res_str, "s = \"512B\"\n"); + let res_size: SizeHolder = toml::from_str(&res_str).unwrap(); + assert_eq!(res_size.s.0, c.s.0); + + let decode_cases = vec![ + (" 0.5 PB", PIB / 2), + ("0.5 TB", TIB / 2), + ("0.5GB ", GIB / 2), + ("0.5MB", MIB / 2), + ("0.5KB", KIB / 2), + ("0.5P", PIB / 2), + ("0.5T", TIB / 2), + ("0.5G", GIB / 2), + ("0.5M", MIB / 2), + ("0.5K", KIB / 2), + ("23", 23), + ("1", 1), + ("1024B", KIB), + // units with binary prefixes + (" 0.5 PiB", PIB / 2), + ("1PiB", PIB), + ("0.5 TiB", TIB / 2), + ("2 TiB", TIB * 2), + ("0.5GiB ", GIB / 2), + ("787GiB ", GIB * 787), + ("0.5MiB", MIB / 2), + ("3MiB", MIB * 3), + ("0.5KiB", KIB / 2), + ("1 KiB", KIB), + // scientific notation + ("0.5e6 B", B * 500000), + ("0.5E6 B", B * 500000), + ("1e6B", B * 1000000), + ("8E6B", B * 8000000), + ("8e7", B * 80000000), + ("1e-1MB", MIB / 10), + ("1e+1MB", MIB * 10), + ("0e+10MB", 0), + ]; + for (src, exp) in decode_cases { + let src = format!("s = {src:?}"); + let res: SizeHolder = toml::from_str(&src).unwrap(); + assert_eq!(res.s.0, exp); + } + + let illegal_cases = vec![ + "0.5kb", "0.5kB", "0.5Kb", "0.5k", "0.5g", "b", "gb", "1b", "B", "1K24B", " 5_KB", + "4B7", "5M_", + ]; + for src in illegal_cases { + let src_str = format!("s = {src:?}"); + assert!(toml::from_str::(&src_str).is_err(), "{}", src); + } + } + + #[test] + fn test_unhash() { + assert_eq!(unhash_u64(hash_u64(777)), 777); + } + + #[test] + fn test_rounding() { + // round_up + assert_eq!(round_up(18, 4), 20); + assert_eq!(round_up(64, 16), 64); + assert_eq!(round_up(79, 4096), 4096); + // round_down + assert_eq!(round_down(18, 4), 16); + assert_eq!(round_down(64, 16), 64); + assert_eq!(round_down(79, 4096), 0); + } +} diff --git a/third/raft-engine/src/write_barrier.rs b/third/raft-engine/src/write_barrier.rs new file mode 100644 index 00000000..356a191c --- /dev/null +++ b/third/raft-engine/src/write_barrier.rs @@ -0,0 +1,376 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +//! Synchronizer of writes. +//! +//! 
This module relies heavily on unsafe codes. Extra call site constraints are +//! required to maintain memory safety. Use it with great caution. + +use std::cell::Cell; +use std::marker::PhantomData; +use std::ptr::NonNull; +use std::time::Instant; + +use fail::fail_point; +use parking_lot::{Condvar, Mutex}; + +use crate::PerfContext; + +type Ptr = Option>; + +/// The writer +pub struct Writer { + next: Cell>>, + payload: *mut P, + output: Option, + + pub(crate) sync: bool, + pub(crate) entered_time: Option, + pub(crate) perf_context_diff: PerfContext, +} + +impl Writer { + /// Creates a new writer. + /// + /// # Safety + /// + /// Data pointed by `payload` is mutably referenced by this writer. Do not + /// access the payload by its original name during this writer's lifetime. + pub fn new(payload: &mut P, sync: bool) -> Self { + Writer { + next: Cell::new(None), + payload: payload as *mut _, + output: None, + sync, + entered_time: None, + perf_context_diff: PerfContext::default(), + } + } + + /// Returns a mutable reference to the payload. + pub fn mut_payload(&mut self) -> &mut P { + unsafe { &mut *self.payload } + } + + /// Sets the output. This method is re-entrant. + pub fn set_output(&mut self, output: O) { + self.output = Some(output); + } + + /// Consumes itself and yields an output. + /// + /// # Panics + /// + /// Panics if called before being processed by a [`WriteBarrier`] or setting + /// the output itself. + pub fn finish(mut self) -> O { + self.output.take().unwrap() + } + + fn get_next(&self) -> Ptr> { + self.next.get() + } + + fn set_next(&self, next: Ptr>) { + self.next.set(next); + } +} + +/// A collection of writers. User thread (leader) that receives a [`WriteGroup`] +/// is responsible for processing its containing writers. +pub struct WriteGroup<'a, 'b, P: 'a, O: 'a> { + start: Ptr>, + back: Ptr>, + + ref_barrier: &'a WriteBarrier, + marker: PhantomData<&'b Writer>, +} + +impl<'a, 'b, P, O> WriteGroup<'a, 'b, P, O> { + pub fn iter_mut(&mut self) -> WriterIter<'_, 'a, 'b, P, O> { + WriterIter { + start: self.start, + back: self.back, + marker: PhantomData, + } + } +} + +impl<'a, 'b, P, O> Drop for WriteGroup<'a, 'b, P, O> { + fn drop(&mut self) { + self.ref_barrier.leader_exit(); + } +} + +/// An iterator over the [`Writer`]s in one [`WriteGroup`]. +pub struct WriterIter<'a, 'b, 'c, P: 'c, O: 'c> { + start: Ptr>, + back: Ptr>, + marker: PhantomData<&'a WriteGroup<'b, 'c, P, O>>, +} + +impl<'a, 'b, 'c, P, O> Iterator for WriterIter<'a, 'b, 'c, P, O> { + type Item = &'a mut Writer; + + fn next(&mut self) -> Option { + if self.start.is_none() { + None + } else { + let writer = unsafe { self.start.unwrap().as_mut() }; + if self.start == self.back { + self.start = None; + } else { + self.start = writer.get_next(); + } + Some(writer) + } + } +} + +struct WriteBarrierInner { + head: Cell>>, + tail: Cell>>, + + pending_leader: Cell>>, + pending_index: Cell, +} + +unsafe impl Send for WriteBarrierInner {} + +impl Default for WriteBarrierInner { + fn default() -> Self { + WriteBarrierInner { + head: Cell::new(None), + tail: Cell::new(None), + pending_leader: Cell::new(None), + pending_index: Cell::new(0), + } + } +} + +/// A synchronizer of [`Writer`]s. 
+pub struct WriteBarrier { + inner: Mutex>, + leader_cv: Condvar, + follower_cvs: [Condvar; 2], +} + +impl Default for WriteBarrier { + fn default() -> Self { + WriteBarrier { + leader_cv: Condvar::new(), + follower_cvs: [Condvar::new(), Condvar::new()], + inner: Mutex::new(WriteBarrierInner::default()), + } + } +} + +impl WriteBarrier { + /// Waits until the caller should perform some work. If `writer` has become + /// the leader of a set of writers, returns a [`WriteGroup`] that contains + /// them, `writer` included. + pub fn enter<'a>(&self, writer: &'a mut Writer) -> Option> { + let node = unsafe { Some(NonNull::new_unchecked(writer)) }; + let mut inner = self.inner.lock(); + if let Some(tail) = inner.tail.get() { + unsafe { + tail.as_ref().set_next(node); + } + inner.tail.set(node); + + if inner.pending_leader.get().is_some() { + // follower of next write group. + self.follower_cvs[inner.pending_index.get() % 2].wait(&mut inner); + return None; + } else { + // leader of next write group. + inner.pending_leader.set(node); + inner + .pending_index + .set(inner.pending_index.get().wrapping_add(1)); + // + self.leader_cv.wait(&mut inner); + inner.pending_leader.set(None); + } + } else { + // leader of a empty write group. proceed directly. + debug_assert!(inner.pending_leader.get().is_none()); + inner.head.set(node); + inner.tail.set(node); + } + + Some(WriteGroup { + start: node, + back: inner.tail.get(), + ref_barrier: self, + marker: PhantomData, + }) + } + + /// Must called when write group leader finishes processing its responsible + /// writers, and next write group should be formed. + fn leader_exit(&self) { + fail_point!("write_barrier::leader_exit", |_| {}); + let inner = self.inner.lock(); + if let Some(leader) = inner.pending_leader.get() { + // wake up leader of next write group. + self.leader_cv.notify_one(); + // wake up follower of current write group. + self.follower_cvs[inner.pending_index.get().wrapping_sub(1) % 2].notify_all(); + inner.head.set(Some(leader)); + } else { + // wake up follower of current write group. + self.follower_cvs[inner.pending_index.get() % 2].notify_all(); + inner.head.set(None); + inner.tail.set(None); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::mpsc; + use std::sync::{Arc, Barrier}; + use std::thread::{self, Builder as ThreadBuilder}; + use std::time::Duration; + + #[test] + fn test_sequential_groups() { + let barrier: WriteBarrier<(), u32> = Default::default(); + let mut leaders = 0; + let mut processed_writers = 0; + + for _ in 0..4 { + let mut writer = Writer::new(&mut (), false); + { + let mut wg = barrier.enter(&mut writer).unwrap(); + leaders += 1; + for writer in wg.iter_mut() { + writer.set_output(7); + processed_writers += 1; + } + } + assert_eq!(writer.finish(), 7); + } + + assert_eq!(processed_writers, 4); + assert_eq!(leaders, 4); + } + + struct ConcurrentWriteContext { + barrier: Arc>, + + seq: u32, + ths: Vec>, + leader_exit_tx: mpsc::SyncSender<()>, + leader_exit_rx: mpsc::Receiver<()>, + } + + impl ConcurrentWriteContext { + fn new() -> Self { + let (leader_exit_tx, leader_exit_rx) = mpsc::sync_channel(0); + Self { + barrier: Default::default(), + seq: 0, + ths: Vec::new(), + leader_exit_tx, + leader_exit_rx, + } + } + + // 1) create `n` writers and form a new write group + // 2) current active write group finishes writing and exits + // 3) the new write group enters writing phrase + fn step(&mut self, n: usize) { + if self.ths.is_empty() { + // ensure there is at least one active writer. 
+ self.seq += 1; + let (leader_enter_tx, leader_enter_rx) = mpsc::channel(); + + let barrier = self.barrier.clone(); + let leader_exit_tx = self.leader_exit_tx.clone(); + let mut seq = self.seq; + self.ths.push( + ThreadBuilder::new() + .spawn(move || { + let mut writer = Writer::new(&mut seq, false); + { + let mut wg = barrier.enter(&mut writer).unwrap(); + leader_enter_tx.send(()).unwrap(); + let mut n = 0; + for w in wg.iter_mut() { + let p = *w.mut_payload(); + w.set_output(p); + n += 1; + } + assert_eq!(n, 1); + leader_exit_tx.send(()).unwrap(); + } + assert_eq!(writer.finish(), seq); + }) + .unwrap(), + ); + + leader_enter_rx.recv().unwrap(); + } + + let prev_writers = self.ths.len(); + let (leader_enter_tx, leader_enter_rx) = mpsc::channel(); + let start_thread = Arc::new(Barrier::new(n + 1)); + for _ in 0..n { + self.seq += 1; + + let barrier = self.barrier.clone(); + let start_thread = start_thread.clone(); + let leader_enter_tx_clone = leader_enter_tx.clone(); + let leader_exit_tx = self.leader_exit_tx.clone(); + let mut seq = self.seq; + self.ths.push( + ThreadBuilder::new() + .spawn(move || { + let mut writer = Writer::new(&mut seq, false); + start_thread.wait(); + if let Some(mut wg) = barrier.enter(&mut writer) { + leader_enter_tx_clone.send(()).unwrap(); + let mut idx = 0; + for w in wg.iter_mut() { + let p = *w.mut_payload(); + w.set_output(p); + idx += 1; + } + assert_eq!(idx, n as u32); + leader_exit_tx.send(()).unwrap(); + } + assert_eq!(writer.finish(), seq); + }) + .unwrap(), + ); + } + start_thread.wait(); + std::thread::sleep(Duration::from_millis(100)); + // unblock current leader + self.leader_exit_rx.recv().unwrap(); + for th in self.ths.drain(0..prev_writers) { + th.join().unwrap(); + } + // make sure new leader is ready + leader_enter_rx.recv().unwrap(); + } + + fn join(&mut self) { + self.leader_exit_rx.recv().unwrap(); + for th in self.ths.drain(..) { + th.join().unwrap(); + } + } + } + + #[test] + fn test_parallel_groups() { + let mut ctx = ConcurrentWriteContext::new(); + for i in 1..5 { + ctx.step(i); + } + ctx.join(); + } +} diff --git a/third/raft-engine/tests/benches/bench_recovery.rs b/third/raft-engine/tests/benches/bench_recovery.rs new file mode 100644 index 00000000..55b42c22 --- /dev/null +++ b/third/raft-engine/tests/benches/bench_recovery.rs @@ -0,0 +1,186 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
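+//
+// This benchmark first fills a temporary engine directory with roughly
+// `total_size` bytes of data for each config, then measures `Engine::open`,
+// i.e. recovery time. Assuming the Cargo `[[bench]]` target pointing at
+// tests/benches/mod.rs is named `benches`, it can be invoked with something
+// like:
+//
+//     cargo bench --bench benches -- Engine::open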
+ +use criterion::{criterion_group, BenchmarkId, Criterion}; +use raft::eraftpb::Entry; +use raft_engine::ReadableSize; +use raft_engine::{Config as EngineConfig, Engine, LogBatch, MessageExt, Result}; +use rand::{Rng, SeedableRng}; +use std::collections::HashMap; +use std::fmt; +use std::path::PathBuf; +use tempfile::TempDir; + +#[derive(Clone)] +struct MessageExtTyped; +impl MessageExt for MessageExtTyped { + type Entry = Entry; + + fn index(entry: &Entry) -> u64 { + entry.index + } +} + +struct Config { + total_size: ReadableSize, + region_count: u64, + batch_size: ReadableSize, + item_size: ReadableSize, + entry_size: ReadableSize, + batch_compression_threshold: ReadableSize, +} + +impl Default for Config { + fn default() -> Self { + Self { + total_size: ReadableSize::gb(1), + region_count: 100, + batch_size: ReadableSize::mb(1), + item_size: ReadableSize::kb(1), + entry_size: ReadableSize(256), + batch_compression_threshold: ReadableSize(0), + } + } +} + +impl fmt::Display for Config { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} [region-count: {}][batch-size: {}][item-size: {}][entry-size: {}][batch-compression-threshold: {}]", + self.total_size, + self.region_count, + self.batch_size, + self.item_size, + self.entry_size, + self.batch_compression_threshold, + ) + } +} + +fn generate(cfg: &Config) -> Result { + let dir = tempfile::Builder::new().prefix("bench").tempdir().unwrap(); + let path = dir.path().to_str().unwrap().to_owned(); + let mut rng = rand::rngs::StdRng::seed_from_u64(0); + + let ecfg = EngineConfig { + dir: path.clone(), + batch_compression_threshold: cfg.batch_compression_threshold, + ..Default::default() + }; + + let engine = Engine::open(ecfg).unwrap(); + + let mut indexes: HashMap = (1..cfg.region_count + 1).map(|rid| (rid, 0)).collect(); + while dir_size(&path).0 < cfg.total_size.0 { + let mut batch = LogBatch::default(); + while batch.approximate_size() < cfg.batch_size.0 as usize { + let region_id = rng.gen_range(1..cfg.region_count + 1); + let mut item_size = 0; + let mut entries = vec![]; + while item_size < cfg.item_size.0 { + entries.push(Entry { + data: (&mut rng) + .sample_iter(rand::distributions::Standard) + .take(cfg.entry_size.0 as usize) + .collect::>() + .into(), + ..Default::default() + }); + item_size += cfg.entry_size.0; + } + let mut index = *indexes.get(®ion_id).unwrap(); + index = entries.iter_mut().fold(index, |index, e| { + e.index = index + 1; + index + 1 + }); + *indexes.get_mut(®ion_id).unwrap() = index; + batch + .add_entries::(region_id, &entries) + .unwrap(); + } + engine.write(&mut batch, false).unwrap(); + } + engine.sync().unwrap(); + drop(engine); + Ok(dir) +} + +fn dir_size(path: &str) -> ReadableSize { + ReadableSize( + std::fs::read_dir(PathBuf::from(path)) + .unwrap() + .map(|entry| std::fs::metadata(entry.unwrap().path()).unwrap().len()) + .sum(), + ) +} + +// Benchmarks + +fn bench_recovery(c: &mut Criterion) { + // prepare input + let cfgs = vec![ + ( + "default".to_owned(), + Config { + ..Default::default() + }, + ), + ( + "compressed".to_owned(), + Config { + batch_compression_threshold: ReadableSize::kb(8), + ..Default::default() + }, + ), + ( + "small-batch(1KB)".to_owned(), + Config { + region_count: 100, + batch_size: ReadableSize::kb(1), + item_size: ReadableSize(256), + entry_size: ReadableSize(32), + ..Default::default() + }, + ), + ( + "10GB".to_owned(), + Config { + total_size: ReadableSize::gb(10), + region_count: 1000, + ..Default::default() + }, + ), + ]; + + for (i, (name, cfg)) 
in cfgs.iter().enumerate() { + println!("config-{i}: [{name}] {cfg}"); + } + + fail::cfg("log_fd::open::fadvise_dontneed", "return").unwrap(); + for (i, (name, cfg)) in cfgs.iter().enumerate() { + let dir = generate(cfg).unwrap(); + let path = dir.path().to_str().unwrap().to_owned(); + let ecfg = EngineConfig { + dir: path.clone(), + batch_compression_threshold: cfg.batch_compression_threshold, + ..Default::default() + }; + c.bench_with_input( + BenchmarkId::new( + "Engine::open", + format!("size:{} config-{}: {}", dir_size(&path), i, name), + ), + &ecfg, + |b, cfg| { + b.iter(|| { + Engine::open(cfg.to_owned()).unwrap(); + }) + }, + ); + } + fail::remove("log_fd::open::fadvise_dontneed"); +} + +criterion_group! { + name = benches; + config = Criterion::default().sample_size(10); + targets = bench_recovery +} diff --git a/third/raft-engine/tests/benches/mod.rs b/third/raft-engine/tests/benches/mod.rs new file mode 100644 index 00000000..1519c685 --- /dev/null +++ b/third/raft-engine/tests/benches/mod.rs @@ -0,0 +1,9 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +extern crate libc; + +use criterion::criterion_main; + +mod bench_recovery; + +criterion_main!(bench_recovery::benches); diff --git a/third/raft-engine/tests/failpoints/mod.rs b/third/raft-engine/tests/failpoints/mod.rs new file mode 100644 index 00000000..47ea42a4 --- /dev/null +++ b/third/raft-engine/tests/failpoints/mod.rs @@ -0,0 +1,44 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +#![cfg_attr(feature = "swap", feature(allocator_api))] + +mod util; + +mod test_engine; +mod test_io_error; + +use fail::FailGuard; +use raft_engine::*; +use util::*; + +#[ctor::ctor] +fn init() { + env_logger::init(); +} + +#[test] +fn test_log_batch_full() { + let _f = FailGuard::new("log_batch::1kb_entries_size_per_batch", "return"); + let mut batch_1 = LogBatch::default(); + let mut batch_2 = LogBatch::default(); + let data = vec![b'x'; 800]; + let entries = generate_entries(1, 2, Some(&data)); + batch_1.add_entries::(1, &entries).unwrap(); + batch_2.add_entries::(2, &entries).unwrap(); + + let mut batch_1_clone = batch_1.clone(); + let mut batch_2_clone = batch_2.clone(); + assert!(matches!( + batch_1_clone.merge(&mut batch_2_clone), + Err(Error::Full) + )); + assert_eq!(batch_1, batch_1_clone); + assert_eq!(batch_2, batch_2_clone); + + let mut batch_1_clone = batch_1.clone(); + assert!(matches!( + batch_1_clone.add_entries::(3, &entries), + Err(Error::Full) + )); + assert_eq!(batch_1, batch_1_clone); +} diff --git a/third/raft-engine/tests/failpoints/test_engine.rs b/third/raft-engine/tests/failpoints/test_engine.rs new file mode 100644 index 00000000..8d74b70b --- /dev/null +++ b/third/raft-engine/tests/failpoints/test_engine.rs @@ -0,0 +1,1188 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. 
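+//
+// These tests are driven by `fail` failpoints: `FailGuard::new(name, actions)`
+// configures the named failpoint for the guard's scope and clears it again on
+// drop. Action strings use the fail-rs syntax, e.g. "return", "pause", "off",
+// or sequenced forms such as "1*off->1*return->off" (skip one hit, fail the
+// next, then disable).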
+ +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::{Arc, Barrier}; +use std::time::Duration; + +use fail::FailGuard; +use kvproto::raft_serverpb::RaftLocalState; +use raft::eraftpb::Entry; +use raft_engine::env::{FileSystem, ObfuscatedFileSystem}; +use raft_engine::internals::*; +use raft_engine::*; + +use crate::util::*; + +fn append( + engine: &Engine, + rid: u64, + start_index: u64, + end_index: u64, + data: Option<&[u8]>, +) { + let entries = generate_entries(start_index, end_index, data); + if !entries.is_empty() { + let mut batch = LogBatch::default(); + batch.add_entries::(rid, &entries).unwrap(); + batch + .put_message( + rid, + b"last_index".to_vec(), + &RaftLocalState { + last_index: entries[entries.len() - 1].index, + ..Default::default() + }, + ) + .unwrap(); + engine.write(&mut batch, true).unwrap(); + } +} + +#[test] +fn test_pipe_log_listeners() { + use std::collections::HashMap; + + #[derive(Default)] + struct QueueHook { + files: AtomicUsize, + appends: AtomicUsize, + applys: AtomicUsize, + purged: AtomicU64, + } + + impl QueueHook { + fn files(&self) -> usize { + self.files.load(Ordering::Acquire) + } + fn appends(&self) -> usize { + self.appends.load(Ordering::Acquire) + } + fn applys(&self) -> usize { + self.applys.load(Ordering::Acquire) + } + fn purged(&self) -> u64 { + self.purged.load(Ordering::Acquire) + } + } + + struct Hook(HashMap); + impl Default for Hook { + fn default() -> Hook { + let mut hash = HashMap::default(); + hash.insert(LogQueue::Append, QueueHook::default()); + hash.insert(LogQueue::Rewrite, QueueHook::default()); + Hook(hash) + } + } + + impl EventListener for Hook { + fn post_new_log_file(&self, id: FileId) { + self.0[&id.queue].files.fetch_add(1, Ordering::Release); + } + + fn on_append_log_file(&self, handle: FileBlockHandle) { + self.0[&handle.id.queue] + .appends + .fetch_add(1, Ordering::Release); + } + + fn post_apply_memtables(&self, id: FileId) { + self.0[&id.queue].applys.fetch_add(1, Ordering::Release); + } + + fn post_purge(&self, id: FileId) { + self.0[&id.queue].purged.store(id.seq, Ordering::Release); + } + } + + let dir = tempfile::Builder::new() + .prefix("test_pipe_log_listeners") + .tempdir() + .unwrap(); + + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(128), + purge_threshold: ReadableSize::kb(512), + batch_compression_threshold: ReadableSize::kb(0), + ..Default::default() + }; + + let hook = Arc::new(Hook::default()); + let engine = Arc::new(Engine::open_with_listeners(cfg.clone(), vec![hook.clone()]).unwrap()); + assert_eq!(hook.0[&LogQueue::Append].files(), 1); + assert_eq!(hook.0[&LogQueue::Rewrite].files(), 1); + + let data = vec![b'x'; 64 * 1024]; + + // Append 10 logs for region 1, 10 logs for region 2. + for i in 1..=20 { + let region_id = (i as u64 - 1) % 2 + 1; + append( + &engine, + region_id, + (i as u64 + 1) / 2, + (i as u64 + 1) / 2 + 1, + Some(&data), + ); + assert_eq!(hook.0[&LogQueue::Append].appends(), i); + assert_eq!(hook.0[&LogQueue::Append].applys(), i); + } + assert_eq!(hook.0[&LogQueue::Append].files(), 10); + + engine.purge_expired_files().unwrap(); + assert_eq!(hook.0[&LogQueue::Append].purged(), 8); + + let rewrite_files = hook.0[&LogQueue::Rewrite].files(); + + // Append 5 logs for region 1, 5 logs for region 2. 
+ for i in 21..=30 { + let region_id = (i as u64 - 1) % 2 + 1; + append( + &engine, + region_id, + (i as u64 + 1) / 2, + (i as u64 + 1) / 2 + 1, + Some(&data), + ); + assert_eq!(hook.0[&LogQueue::Append].appends(), i); + assert_eq!(hook.0[&LogQueue::Append].applys(), i); + } + // Compact so that almost all content of rewrite queue will become garbage. + engine.compact_to(1, 14); + engine.compact_to(2, 14); + assert_eq!(hook.0[&LogQueue::Append].appends(), 32); + assert_eq!(hook.0[&LogQueue::Append].applys(), 32); + + engine.purge_expired_files().unwrap(); + assert_eq!(hook.0[&LogQueue::Append].purged(), 14); + assert_eq!(hook.0[&LogQueue::Rewrite].purged(), rewrite_files as u64); + + // Write region 3 without applying. + let apply_memtable_region_3_fp = "memtable_accessor::apply_append_writes::region_3"; + fail::cfg(apply_memtable_region_3_fp, "pause").unwrap(); + let engine_clone = engine.clone(); + let data_clone = data.clone(); + let th = std::thread::spawn(move || { + append(&engine_clone, 3, 1, 2, Some(&data_clone)); + }); + + // Sleep a while to wait the log batch `Append(3, [1])` to get written. + std::thread::sleep(Duration::from_millis(200)); + assert_eq!(hook.0[&LogQueue::Append].appends(), 33); + let file_not_applied = engine.file_span(LogQueue::Append).1; + assert_eq!(hook.0[&LogQueue::Append].applys(), 32); + + for i in 31..=40 { + let region_id = (i as u64 - 1) % 2 + 1; + append( + &engine, + region_id, + (i as u64 + 1) / 2, + (i as u64 + 1) / 2 + 1, + Some(&data), + ); + assert_eq!(hook.0[&LogQueue::Append].appends(), i + 3); + assert_eq!(hook.0[&LogQueue::Append].applys(), i + 2); + } + + // Can't purge because region 3 is not yet applied. + engine.purge_expired_files().unwrap(); + let first = engine.file_span(LogQueue::Append).0; + assert_eq!(file_not_applied, first); + + // Resume write on region 3. + fail::remove(apply_memtable_region_3_fp); + th.join().unwrap(); + + std::thread::sleep(Duration::from_millis(200)); + engine.purge_expired_files().unwrap(); + let new_first = engine.file_span(LogQueue::Append).0; + assert_ne!(file_not_applied, new_first); + + // Drop and then recover. 
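+    // (Recovery replays every remaining file, so the fresh listener's per-queue
+    // file counters must equal the width of the recovered file spans checked
+    // below.)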
+ drop(engine); + + let hook = Arc::new(Hook::default()); + let engine = Engine::open_with_listeners(cfg, vec![hook.clone()]).unwrap(); + assert_eq!( + hook.0[&LogQueue::Append].files() as u64, + engine.file_span(LogQueue::Append).1 - engine.file_span(LogQueue::Append).0 + 1 + ); + assert_eq!( + hook.0[&LogQueue::Rewrite].files() as u64, + engine.file_span(LogQueue::Rewrite).1 - engine.file_span(LogQueue::Rewrite).0 + 1 + ); +} + +#[test] +fn test_concurrent_write_empty_log_batch() { + let dir = tempfile::Builder::new() + .prefix("test_concurrent_write_empty_log_batch") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = Arc::new(Engine::open(cfg.clone()).unwrap()); + let mut ctx = ConcurrentWriteContext::new(engine.clone()); + + let some_entries = vec![ + Entry::new(), + Entry { + index: 1, + ..Default::default() + }, + ]; + + ctx.write(LogBatch::default()); + let mut log_batch = LogBatch::default(); + log_batch + .add_entries::(1, &some_entries) + .unwrap(); + ctx.write(log_batch); + ctx.join(); + + let mut log_batch = LogBatch::default(); + log_batch + .add_entries::(2, &some_entries) + .unwrap(); + ctx.write(log_batch); + ctx.write(LogBatch::default()); + ctx.join(); + drop(ctx); + drop(engine); + + let engine = Engine::open(cfg).unwrap(); + let mut entries = Vec::new(); + engine + .fetch_entries_to::( + 1, /* region */ + 0, /* begin */ + 2, /* end */ + None, /* max_size */ + &mut entries, + ) + .unwrap(); + assert_eq!(entries, some_entries); + entries.clear(); + engine + .fetch_entries_to::( + 2, /* region */ + 0, /* begin */ + 2, /* end */ + None, /* max_size */ + &mut entries, + ) + .unwrap(); + assert_eq!(entries, some_entries); +} + +#[test] +fn test_consistency_tools() { + let dir = tempfile::Builder::new() + .prefix("test_consistency_tools") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(128), + ..Default::default() + }; + let engine = Arc::new(Engine::open(cfg.clone()).unwrap()); + let data = vec![b'x'; 128]; + for index in 1..=100 { + for rid in 1..=10 { + let _f = if index == rid * rid { + Some(FailGuard::new("log_batch::corrupted_items", "return")) + } else { + None + }; + append(&engine, rid, index, index + 1, Some(&data)); + } + } + drop(engine); + assert!(Engine::open(cfg.clone()).is_err()); + + let ids = Engine::consistency_check(dir.path()).unwrap(); + for (id, index) in ids.iter() { + assert_eq!(id * id, index + 1); + } + + // Panic instead of err because `consistency_check` also removes corruptions. 
+ assert!(catch_unwind_silent(|| Engine::open(cfg.clone())).is_err()); +} + +#[cfg(feature = "scripting")] +#[test] +fn test_repair_tool() { + let dir = tempfile::Builder::new() + .prefix("test_repair_tool") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(128), + ..Default::default() + }; + let engine = Arc::new(Engine::open(cfg.clone()).unwrap()); + let data = vec![b'x'; 128]; + for index in 1..=100 { + for rid in 1..=10 { + let _f = if index == rid * rid { + Some(FailGuard::new("log_batch::corrupted_items", "return")) + } else { + None + }; + append(&engine, rid, index, index + 1, Some(&data)); + } + } + drop(engine); + + assert!(Engine::open(cfg.clone()).is_err()); + let script = "".to_owned(); + assert!(Engine::unsafe_repair(dir.path(), None, script).is_err()); + let script = " + fn filter_append(id, first, count, rewrite_count, queue, ifirst, ilast) { + if first + count < ifirst { + return 2; // discard existing + } + 0 // default + } + " + .to_owned(); + Engine::unsafe_repair(dir.path(), None, script).unwrap(); + Engine::open(cfg).unwrap(); +} + +#[test] +fn test_incomplete_purge() { + let dir = tempfile::Builder::new() + .prefix("test_incomplete_purge") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1), + ..Default::default() + }; + let rid = 1; + let data = vec![b'7'; 1024]; + + let engine = Engine::open(cfg.clone()).unwrap(); + + { + let _f = FailGuard::new("default_fs::delete_skipped", "return"); + append(&engine, rid, 0, 20, Some(&data)); + let append_first = engine.file_span(LogQueue::Append).0; + engine.compact_to(rid, 18); + engine.purge_expired_files().unwrap(); + assert!(engine.file_span(LogQueue::Append).0 > append_first); + } + + // Create a hole. + append(&engine, rid, 20, 40, Some(&data)); + let append_first = engine.file_span(LogQueue::Append).0; + engine.compact_to(rid, 38); + engine.purge_expired_files().unwrap(); + assert!(engine.file_span(LogQueue::Append).0 > append_first); + + append(&engine, rid, 40, 60, Some(&data)); + let append_first = engine.file_span(LogQueue::Append).0; + drop(engine); + + let engine = Engine::open(cfg).unwrap(); + assert_eq!(engine.file_span(LogQueue::Append).0, append_first); + assert_eq!(engine.first_index(rid).unwrap(), 38); + assert_eq!(engine.last_index(rid).unwrap(), 59); +} + +#[test] +fn test_tail_corruption() { + let data = vec![b'x'; 16]; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let rid = 1; + // Header is correct, record is corrupted. + { + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_1") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + format_version: Version::V2, + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let _f = FailGuard::new("log_batch::corrupted_items", "return"); + append(&engine, rid, 1, 5, Some(&data)); + drop(engine); + let engine = Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + assert_eq!(engine.first_index(rid), None); + } + // Tail entries block is corrupted. 
+ { + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_1_1") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + format_version: Version::V2, + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let _f = FailGuard::new("log_batch::corrupted_entries", "return"); + append(&engine, rid, 1, 5, Some(&data)); + drop(engine); + let engine = Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + assert_eq!(engine.first_index(rid), None); + } + // Repeat with absolute consistency. + { + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_1_2") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + format_version: Version::V2, + recovery_mode: RecoveryMode::AbsoluteConsistency, + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let _f = FailGuard::new("log_batch::corrupted_entries", "return"); + append(&engine, rid, 1, 5, Some(&data)); + drop(engine); + assert!(Engine::open_with_file_system(cfg, fs.clone()).is_err()); + } + // Header is corrupted. + { + let _f = FailGuard::new("log_file_header::corrupted", "return"); + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_2") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + drop(engine); + Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + } + // Header is corrupted, followed by some records. + { + let _f = FailGuard::new("log_file_header::corrupted", "return"); + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_3") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + append(&engine, rid, 1, 5, Some(&data)); + drop(engine); + Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + } + // Version::V1 in header owns abnormal DataLayout. + { + let _f = FailGuard::new("log_file_header::too_large", "return"); + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_4") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1), + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + drop(engine); + // Version::V1 will be parsed successfully as the data_layout when the related + // `version == V1` will be ignored. 
+ let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + append(&engine, rid, 1, 5, Some(&data)); + drop(engine); + Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + } + // DataLayout in header is corrupted for Version::V2 + { + let _f = FailGuard::new("log_file_header::too_small", "return"); + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_5") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + format_version: Version::V2, + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + drop(engine); + Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + } + // DataLayout in header is abnormal for Version::V2 + { + let _f = FailGuard::new("log_file_header::too_large", "return"); + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_6") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + format_version: Version::V2, + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + drop(engine); + Engine::open_with_file_system(cfg, fs.clone()).unwrap(); + } + // DataLayout in header is corrupted for Version::V2, followed with records + { + let _f = FailGuard::new("log_file_header::too_small", "return"); + let dir = tempfile::Builder::new() + .prefix("test_tail_corruption_7") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + purge_threshold: ReadableSize(1), + format_version: Version::V2, + ..Default::default() + }; + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + drop(engine); + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + append(&engine, rid, 1, 2, Some(&data)); + append(&engine, rid, 2, 3, Some(&data)); + drop(engine); + assert!(Engine::open_with_file_system(cfg, fs).is_err()); + } +} + +#[test] +fn test_concurrent_write_perf_context() { + let dir = tempfile::Builder::new() + .prefix("test_concurrent_write_perf_context") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + + let some_entries = vec![ + Entry::new(), + Entry { + index: 1, + ..Default::default() + }, + ]; + + let engine = Arc::new(Engine::open(cfg).unwrap()); + let barrier = Arc::new(Barrier::new(4)); + + let ths: Vec<_> = (1..=3) + .map(|i| { + let engine = engine.clone(); + let barrier = barrier.clone(); + let some_entries = some_entries.clone(); + std::thread::spawn(move || { + barrier.wait(); + let mut log_batch = LogBatch::default(); + log_batch + .add_entries::(i, &some_entries) + .unwrap(); + let old_perf_context = get_perf_context(); + engine.write(&mut log_batch, true).unwrap(); + let new_perf_context = get_perf_context(); + (old_perf_context, new_perf_context) + }) + }) + .collect(); + + fail::cfg_callback("write_barrier::leader_exit", move || { + barrier.wait(); + // Sleep a while until new writers enter the next write group. 
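+        // (This callback runs on the leader of the first write group while it
+        // is still inside `leader_exit`; the barrier releases the three writer
+        // threads, and the sleep gives them time to queue up as the next write
+        // group, so their perf contexts end up with a non-zero wait duration.)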
+ std::thread::sleep(Duration::from_millis(100)); + fail::remove("write_barrier::leader_exit"); + }) + .unwrap(); + + let mut log_batch = LogBatch::default(); + log_batch + .add_entries::(4, &some_entries) + .unwrap(); + engine.write(&mut log_batch, true).unwrap(); + + for th in ths { + let (old, new) = th.join().unwrap(); + assert_ne!(old.log_populating_duration, new.log_populating_duration); + assert_ne!(old.write_wait_duration, new.write_wait_duration); + assert_ne!(old.log_write_duration, new.log_write_duration); + assert_ne!(old.apply_duration, new.apply_duration); + } +} + +// FIXME: this test no longer works because recovery cannot reliably detect +// overwrite anomaly. +// See https://github.com/tikv/raft-engine/issues/250 +#[test] +#[should_panic] +fn test_recycle_with_stale_logbatch_at_tail() { + let dir = tempfile::Builder::new() + .prefix("test_recycle_with_stale_log_batch_at_tail") + .tempdir() + .unwrap(); + let data = vec![b'x'; 1024]; + let rid = 1; + let cfg_err = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(2), + purge_threshold: ReadableSize::kb(4), + enable_log_recycle: true, + format_version: Version::V1, + ..Default::default() + }; + // Force open Engine with `enable_log_recycle == true` and + // `format_version == Version::V1`. + let engine = { + let _f = FailGuard::new("pipe_log::version::force_enable_log_signing", "return"); + Engine::open(cfg_err.clone()).unwrap() + }; + // Do not truncate the active_file when exit + let _f = FailGuard::new("file_pipe_log::log_file_writer::skip_truncate", "return"); + assert_eq!(cfg_err.format_version, Version::V1); + append(&engine, rid, 1, 2, Some(&data)); // file_seq: 1 + append(&engine, rid, 2, 3, Some(&data)); + append(&engine, rid, 3, 4, Some(&data)); // file_seq: 2 + append(&engine, rid, 4, 5, Some(&data)); + append(&engine, rid, 5, 6, Some(&data)); // file_seq: 3 + let append_first = engine.file_span(LogQueue::Append).0; + engine.compact_to(rid, 3); + engine.purge_expired_files().unwrap(); + assert!(engine.file_span(LogQueue::Append).0 > append_first); + // append, written into seq: 3 + append(&engine, rid, 4, 5, Some(&data)); + // recycle, written into seq: 1 + append(&engine, rid, 5, 6, Some(&data)); + drop(engine); + // Recover the engine with invalid Version::default(). + // Causing the final log file is a recycled file, containing rewritten + // LogBatchs and end with stale LogBatchs, `Engine::open(...)` should + // `panic` when recovering the relate `Memtable`. + assert!(catch_unwind_silent(|| { + let cfg_v2 = Config { + format_version: Version::V2, + ..cfg_err + }; + Engine::open(cfg_v2) + }) + .is_err()); +} + +#[test] +fn test_build_engine_with_multi_datalayout() { + let dir = tempfile::Builder::new() + .prefix("test_build_engine_with_multi_datalayout") + .tempdir() + .unwrap(); + let data = vec![b'x'; 12827]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(2), + purge_threshold: ReadableSize::kb(4), + recovery_mode: RecoveryMode::AbsoluteConsistency, + ..Default::default() + }; + // Defaultly, File with DataLayout::NoAlignment. 
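+    // (I.e. this first engine creates files with the default layout; the
+    // alignment failpoint used below forces later files to
+    // DataLayout::Alignment, so the directory ends up holding a mix of both
+    // layouts for recovery.)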
+ let engine = Engine::open(cfg.clone()).unwrap(); + for rid in 1..=3 { + append(&engine, rid, 1, 11, Some(&data)); + } + drop(engine); + // File with DataLayout::Alignment + let _f = FailGuard::new("file_pipe_log::open::force_set_alignment", "return"); + let cfg_v2 = Config { + format_version: Version::V2, + ..cfg + }; + let engine = Engine::open(cfg_v2.clone()).unwrap(); + for rid in 1..=3 { + append(&engine, rid, 11, 20, Some(&data)); + } + drop(engine); + Engine::open(cfg_v2).unwrap(); +} + +#[test] +fn test_build_engine_with_datalayout_abnormal() { + let dir = tempfile::Builder::new() + .prefix("test_build_engine_with_datalayout_abnormal") + .tempdir() + .unwrap(); + let data = vec![b'x'; 1024]; + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(2), + purge_threshold: ReadableSize::kb(4), + recovery_mode: RecoveryMode::AbsoluteConsistency, + format_version: Version::V2, + ..Default::default() + }; + let _f = FailGuard::new("file_pipe_log::open::force_set_alignment", "return"); + let engine = Engine::open(cfg.clone()).unwrap(); + // Content durable with DataLayout::Alignment. + append(&engine, 1, 1, 11, Some(&data)); + append(&engine, 2, 1, 11, Some(&data)); + { + // Set failpoint to dump content with invalid paddings into log file. + let _f1 = FailGuard::new("file_pipe_log::append::corrupted_padding", "return"); + append(&engine, 3, 1, 11, Some(&data)); + drop(engine); + assert!(Engine::open(cfg.clone()).is_err()); + } + { + // Reopen the Engine with TolerateXXX mode. + let mut cfg_v2 = cfg.clone(); + cfg_v2.recovery_mode = RecoveryMode::TolerateTailCorruption; + let engine = Engine::open(cfg_v2).unwrap(); + for rid in 4..=8 { + append(&engine, rid, 1, 11, Some(&data)); + } + drop(engine); + Engine::open(cfg).unwrap(); + } +} + +// issue-228 +#[test] +fn test_partial_rewrite_rewrite() { + let dir = tempfile::Builder::new() + .prefix("test_partial_rewrite_rewrite") + .tempdir() + .unwrap(); + let _f = FailGuard::new("max_rewrite_batch_bytes", "return(1)"); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + recovery_threads: 1, + ..Default::default() + }; + let engine = Engine::open(cfg.clone()).unwrap(); + let data = vec![b'x'; 128]; + + for rid in 1..=3 { + append(&engine, rid, 1, 5, Some(&data)); + append(&engine, rid, 5, 11, Some(&data)); + } + + let old_active_file = engine.file_span(LogQueue::Append).1; + engine.purge_manager().must_rewrite_append_queue(None, None); + assert_eq!(engine.file_span(LogQueue::Append).0, old_active_file + 1); + + for rid in 1..=3 { + append(&engine, rid, 11, 16, Some(&data)); + } + + { + let _f = FailGuard::new("log_file::write::err", "10*off->return->off"); + assert!( + catch_unwind_silent(|| engine.purge_manager().must_rewrite_rewrite_queue()).is_err() + ); + } + + drop(engine); + let engine = Engine::open(cfg).unwrap(); + for rid in 1..=3 { + assert_eq!(engine.first_index(rid).unwrap(), 1); + assert_eq!(engine.last_index(rid).unwrap(), 15); + } +} + +#[test] +fn test_partial_rewrite_rewrite_online() { + let dir = tempfile::Builder::new() + .prefix("test_partial_rewrite_rewrite_online") + .tempdir() + .unwrap(); + let _f = FailGuard::new("max_rewrite_batch_bytes", "return(1)"); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = Engine::open(cfg.clone()).unwrap(); + let data = vec![b'x'; 128]; + + for rid in 1..=3 { + append(&engine, rid, 1, 5, Some(&data)); + append(&engine, rid, 5, 11, Some(&data)); + } + + let 
old_active_file = engine.file_span(LogQueue::Append).1; + engine.purge_manager().must_rewrite_append_queue(None, None); + assert_eq!(engine.file_span(LogQueue::Append).0, old_active_file + 1); + + { + let _f = FailGuard::new("log_file::write::err", "10*off->return->off"); + assert!( + catch_unwind_silent(|| engine.purge_manager().must_rewrite_rewrite_queue()).is_err() + ); + } + + for rid in 1..=3 { + append(&engine, rid, 11, 16, Some(&data)); + } + let old_active_file = engine.file_span(LogQueue::Append).1; + engine.purge_manager().must_rewrite_append_queue(None, None); + assert_eq!(engine.file_span(LogQueue::Append).0, old_active_file + 1); + + drop(engine); + let engine = Engine::open(cfg).unwrap(); + for rid in 1..=3 { + assert_eq!(engine.first_index(rid).unwrap(), 1); + assert_eq!(engine.last_index(rid).unwrap(), 15); + } +} + +fn test_split_rewrite_batch_imp(regions: u64, region_size: u64, split_size: u64, file_size: u64) { + let dir = tempfile::Builder::new() + .prefix("test_split_rewrite_batch") + .tempdir() + .unwrap(); + let _f1 = FailGuard::new("max_rewrite_batch_bytes", &format!("return({split_size})")); + let _f2 = FailGuard::new("force_use_atomic_group", "return"); + + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(file_size), + batch_compression_threshold: ReadableSize(0), + ..Default::default() + }; + let engine = Engine::open(cfg.clone()).unwrap(); + let data = vec![b'x'; region_size as usize / 10]; + + for rid in 1..=regions { + append(&engine, rid, 1, 5, Some(&data)); + append(&engine, rid, 5, 11, Some(&data)); + } + + let old_active_file = engine.file_span(LogQueue::Append).1; + engine.purge_manager().must_rewrite_append_queue(None, None); + assert_eq!(engine.file_span(LogQueue::Append).0, old_active_file + 1); + + drop(engine); + let engine = Engine::open(cfg.clone()).unwrap(); + for rid in 1..=regions { + assert_eq!(engine.first_index(rid).unwrap(), 1); + assert_eq!(engine.last_index(rid).unwrap(), 10); + } + + for rid in 1..=regions { + append(&engine, rid, 11, 16, Some(&data)); + } + let old_active_file = engine.file_span(LogQueue::Append).1; + engine.purge_manager().must_rewrite_append_queue(None, None); + assert_eq!(engine.file_span(LogQueue::Append).0, old_active_file + 1); + drop(engine); + + for i in 1..=10 { + let engine = Engine::open(cfg.clone()).unwrap(); + let count = AtomicU64::new(0); + fail::cfg_callback("atomic_group::begin", move || { + if count.fetch_add(1, Ordering::Relaxed) + 1 == i { + fail::cfg("log_file::write::err", "return").unwrap(); + } + }) + .unwrap(); + let r = catch_unwind_silent(|| engine.purge_manager().must_rewrite_rewrite_queue()); + fail::remove("atomic_group::begin"); + fail::remove("log_file::write::err"); + if r.is_ok() { + break; + } + } + for i in 1..=10 { + let engine = Engine::open(cfg.clone()).unwrap(); + for rid in 1..=regions { + assert_eq!(engine.first_index(rid).unwrap(), 1); + assert_eq!(engine.last_index(rid).unwrap(), 15); + } + let count = AtomicU64::new(0); + fail::cfg_callback("atomic_group::add", move || { + if count.fetch_add(1, Ordering::Relaxed) + 1 == i { + fail::cfg("log_file::write::err", "return").unwrap(); + } + }) + .unwrap(); + let r = catch_unwind_silent(|| engine.purge_manager().must_rewrite_rewrite_queue()); + fail::remove("atomic_group::add"); + fail::remove("log_file::write::err"); + if r.is_ok() { + break; + } + } + let engine = Engine::open(cfg).unwrap(); + for rid in 1..=regions { + assert_eq!(engine.first_index(rid).unwrap(), 1); + 
assert_eq!(engine.last_index(rid).unwrap(), 15); + } +} + +#[test] +fn test_split_rewrite_batch() { + test_split_rewrite_batch_imp(10, 40960, 1, 1); + test_split_rewrite_batch_imp(10, 40960, 1, 40960 * 2); + test_split_rewrite_batch_imp(25, 4096, 6000, 40960 * 2); +} + +#[test] +fn test_split_rewrite_batch_with_only_kvs() { + let dir = tempfile::Builder::new() + .prefix("test_split_rewrite_batch_with_only_kvs") + .tempdir() + .unwrap(); + let _f = FailGuard::new("max_rewrite_batch_bytes", "return(1)"); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let engine = Engine::open(cfg.clone()).unwrap(); + let mut log_batch = LogBatch::default(); + let key = vec![b'x'; 2]; + let value = vec![b'y'; 8]; + + let mut rid = 1; + { + log_batch.put(rid, key.clone(), Vec::new()).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + + engine.purge_manager().must_rewrite_rewrite_queue(); + + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + + engine.purge_manager().must_rewrite_rewrite_queue(); + } + { + let _f = FailGuard::new("force_use_atomic_group", "return"); + log_batch.put(rid, key.clone(), Vec::new()).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + + engine.purge_manager().must_rewrite_rewrite_queue(); + + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + rid += 1; + log_batch.put(rid, key.clone(), value.clone()).unwrap(); + engine.write(&mut log_batch, false).unwrap(); + engine.purge_manager().must_rewrite_append_queue(None, None); + + engine.purge_manager().must_rewrite_rewrite_queue(); + } + + drop(engine); + let engine = Engine::open(cfg).unwrap(); + for i in 1..=rid { + assert_eq!(engine.get(i, &key).unwrap(), value); + } +} + +// issue-315 +#[test] +fn test_split_rewrite_batch_then_delete_some() { + let dir = tempfile::Builder::new() + .prefix("test_split_rewrite_batch_then_delete_some") + .tempdir() + .unwrap(); + let _f = FailGuard::new("max_rewrite_batch_bytes", "return(1)"); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + let mut log_batch = LogBatch::default(); + let value = vec![b'y'; 8]; + + let rid = 1; + let engine = Engine::open(cfg.clone()).unwrap(); + for i in 0..=5 { + append(&engine, rid, i * 2, i * 2 + 2, Some(&value)); + engine.purge_manager().must_rewrite_append_queue(None, None); + } + engine.purge_manager().must_rewrite_rewrite_queue(); + log_batch.add_command(rid, Command::Compact { index: 7 }); + log_batch.delete(rid, b"last_index".to_vec()); + engine.write(&mut log_batch, true).unwrap(); + engine.purge_manager().must_purge_all_stale(); + + drop(engine); + let engine = Engine::open(cfg.clone()).unwrap(); + // The Compact mark is dropped during `must_purge_all_stale`. 
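+    // (Hence the reopened engine still reports the full range [0, 11] for this
+    // region rather than a first index of 7.)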
+ assert_eq!(engine.first_index(rid).unwrap(), 0); + assert_eq!(engine.last_index(rid).unwrap(), 11); + + // Removes all rewrite entries. + log_batch.add_command(rid, Command::Compact { index: 100 }); + engine.write(&mut log_batch, false).unwrap(); + append(&engine, rid, 5, 11, Some(&value)); + engine.purge_manager().must_rewrite_append_queue(None, None); + engine.purge_manager().must_purge_all_stale(); + drop(engine); + let engine = Engine::open(cfg).unwrap(); + assert_eq!(engine.first_index(rid).unwrap(), 5); + assert_eq!(engine.last_index(rid).unwrap(), 10); +} + +#[test] +fn test_build_engine_with_recycling_and_multi_dirs() { + let dir = tempfile::Builder::new() + .prefix("test_build_engine_with_multi_dirs_main") + .tempdir() + .unwrap(); + let spill_dir = tempfile::Builder::new() + .prefix("test_build_engine_with_multi_dirs_spill") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + spill_dir: Some(spill_dir.path().to_str().unwrap().to_owned()), + target_file_size: ReadableSize::kb(1), + purge_threshold: ReadableSize::kb(20), + enable_log_recycle: true, + prefill_for_recycle: true, + ..Default::default() + }; + let data = vec![b'x'; 1024]; + { + // Prerequisite - case 1: all disks are full, Engine can be opened normally. + { + // Multi directories. + let _f = FailGuard::new("file_pipe_log::force_choose_dir", "return"); + Engine::open(cfg.clone()).unwrap(); + // Single diretory - spill-dir is None. + let cfg_single_dir = Config { + spill_dir: None, + ..cfg.clone() + }; + Engine::open(cfg_single_dir).unwrap(); + } + // Prerequisite - case 2: all disks are full after writing, and the current + // engine should be available for `read`. + { + let cfg_no_prefill = Config { + prefill_for_recycle: false, + ..cfg.clone() + }; + let engine = Engine::open(cfg_no_prefill.clone()).unwrap(); + engine + .write(&mut generate_batch(101, 11, 21, Some(&data)), true) + .unwrap(); + drop(engine); + let _f1 = FailGuard::new("file_pipe_log::force_choose_dir", "return"); + let _f2 = FailGuard::new("log_fd::write::no_space_err", "return"); + let engine = Engine::open(cfg_no_prefill).unwrap(); + assert_eq!( + 10, + engine + .fetch_entries_to::(101, 11, 21, None, &mut vec![]) + .unwrap() + ); + } + // Prerequisite - case 3: prefill several recycled logs but no space for + // remains, making prefilling progress exit in advance. + { + let _f1 = FailGuard::new( + "file_pipe_log::force_choose_dir", + "10*return(0)->5*return(1)", + ); + let _f2 = FailGuard::new("log_fd::write::no_space_err", "return"); + let _ = Engine::open(cfg.clone()).unwrap(); + } + // Clean-up the env for later testing. + let cfg_err = Config { + enable_log_recycle: false, + prefill_for_recycle: false, + ..cfg.clone() + }; + let _ = Engine::open(cfg_err).unwrap(); + } + { + // Case 1: prefill recycled logs into multi-dirs (when preparing recycled logs, + // this circumstance also equals to `main dir is full, but spill-dir + // is free`.) + let engine = { + let _f = FailGuard::new("file_pipe_log::force_choose_dir", "10*return(0)->return(1)"); + Engine::open(cfg.clone()).unwrap() + }; + for rid in 1..10 { + append(&engine, rid, 1, 5, Some(&data)); + } + let append_first = engine.file_span(LogQueue::Append).0; + for rid in 1..10 { + engine.compact_to(rid, 3); + } + // Purge do not exceed purge_threshold, and first active file_seq won't change. 
+ engine.purge_expired_files().unwrap(); + assert_eq!(engine.file_span(LogQueue::Append).0, append_first); + for rid in 1..20 { + append(&engine, rid, 3, 5, Some(&data)); + engine.compact_to(rid, 4); + } + // Purge obsolete logs. + engine.purge_expired_files().unwrap(); + assert!(engine.file_span(LogQueue::Append).0 > append_first); + } + { + // Case 2: prefill is on but no spare space for new log files. + let _f = FailGuard::new("file_pipe_log::force_choose_dir", "return"); + let engine = Engine::open(cfg.clone()).unwrap(); + let append_end = engine.file_span(LogQueue::Append).1; + // As there still exists several recycled logs for incoming writes, so the + // following writes will success. + for rid in 1..10 { + append(&engine, rid, 5, 7, Some(&data)); + } + assert!(engine.file_span(LogQueue::Append).1 > append_end); + } + { + // Case 3: no prefill and no spare space for new log files. + let cfg_no_prefill = Config { + enable_log_recycle: true, + prefill_for_recycle: false, + ..cfg + }; + let _f1 = FailGuard::new("file_pipe_log::force_choose_dir", "return"); + let engine = Engine::open(cfg_no_prefill).unwrap(); + let _f2 = FailGuard::new("log_fd::write::no_space_err", "return"); + let (append_first, append_end) = engine.file_span(LogQueue::Append); + // Cannot append new data into engine as no spare space. + for rid in 1..20 { + assert!(catch_unwind_silent(|| append(&engine, rid, 8, 9, Some(&data))).is_err()); + } + assert_eq!( + engine.file_span(LogQueue::Append), + (append_first, append_end) + ); + } +} diff --git a/third/raft-engine/tests/failpoints/test_io_error.rs b/third/raft-engine/tests/failpoints/test_io_error.rs new file mode 100644 index 00000000..4383d7b8 --- /dev/null +++ b/third/raft-engine/tests/failpoints/test_io_error.rs @@ -0,0 +1,657 @@ +// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0. + +use std::sync::Arc; + +use fail::FailGuard; +use raft::eraftpb::Entry; +use raft_engine::env::ObfuscatedFileSystem; +use raft_engine::internals::*; +use raft_engine::*; + +use crate::util::*; + +#[test] +fn test_file_open_error() { + let dir = tempfile::Builder::new() + .prefix("test_file_open_error") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + + { + let _f = FailGuard::new("default_fs::create::err", "return"); + assert!(Engine::open_with_file_system(cfg.clone(), fs.clone()).is_err()); + } + { + let _f = FailGuard::new("default_fs::open::err", "return"); + let _ = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + assert!(Engine::open_with_file_system(cfg, fs).is_err()); + } +} + +#[test] +fn test_file_read_error() { + let dir = tempfile::Builder::new() + .prefix("test_file_read_error") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let entry = vec![b'x'; 1024]; + + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + // Writing an empty message. 
+ engine + .write(&mut generate_batch(1, 0, 1, None), true) + .unwrap(); + engine + .write(&mut generate_batch(2, 1, 10, Some(&entry)), true) + .unwrap(); + let mut kv_batch = LogBatch::default(); + let entry_value = Entry { + index: 111, + data: entry.to_vec().into(), + ..Default::default() + }; + kv_batch + .put_message(1, b"k".to_vec(), &entry_value) + .unwrap(); + engine.write(&mut kv_batch, true).unwrap(); + + let mut entries = Vec::new(); + let _f = FailGuard::new("log_file::read::err", "return"); + engine + .fetch_entries_to::(1, 0, 1, None, &mut entries) + .unwrap(); + engine.get_message::(1, b"k".as_ref()).unwrap(); + engine + .fetch_entries_to::(2, 1, 10, None, &mut entries) + .unwrap_err(); +} + +#[test] +fn test_file_write_error() { + let dir = tempfile::Builder::new() + .prefix("test_file_write_error") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(1024), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let entry = vec![b'x'; 1024]; + + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + engine + .write(&mut generate_batch(1, 1, 2, Some(&entry)), false) + .unwrap(); + { + let _f = FailGuard::new("log_file::write::err", "return"); + engine + .write(&mut generate_batch(1, 2, 3, Some(&entry)), false) + .unwrap_err(); + } + { + let _f = FailGuard::new("log_fd::sync::err", "return"); + engine + .write(&mut generate_batch(1, 2, 3, Some(&entry)), false) + .unwrap(); + assert!(catch_unwind_silent(|| { + let _ = engine.write(&mut generate_batch(1, 3, 4, Some(&entry)), true); + }) + .is_err()); + } + + // Internal states are consistent after panics. But outstanding writes are not + // reverted. + engine + .write(&mut generate_batch(2, 1, 2, Some(&entry)), true) + .unwrap(); + drop(engine); + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + assert_eq!(engine.first_index(1).unwrap(), 1); + assert_eq!(engine.last_index(1).unwrap(), 3); + assert_eq!(engine.first_index(2).unwrap(), 1); + assert_eq!(engine.last_index(2).unwrap(), 1); +} + +fn test_file_rotate_error(restart_after_failure: bool) { + let dir = tempfile::Builder::new() + .prefix("test_file_rotate_error") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(4), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let entry = vec![b'x'; 1024]; + + let mut engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + engine + .write(&mut generate_batch(1, 1, 2, Some(&entry)), false) + .unwrap(); + engine + .write(&mut generate_batch(1, 2, 3, Some(&entry)), false) + .unwrap(); + engine + .write(&mut generate_batch(1, 3, 4, Some(&entry)), false) + .unwrap(); + engine + .write(&mut generate_batch(1, 4, 5, Some(&entry)), false) + .unwrap(); + assert_eq!(engine.file_span(LogQueue::Append).1, 1); + // The next write will be followed by a rotate. + { + // Fail to sync old log file. + let _f = FailGuard::new("log_fd::sync::err", "return"); + assert!(catch_unwind_silent(|| { + let _ = engine.write(&mut generate_batch(1, 4, 5, Some(&entry)), false); + }) + .is_err()); + } + if restart_after_failure { + drop(engine); + engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + } + assert_eq!(engine.file_span(LogQueue::Append).1, 1); + { + // Fail to create new log file. 
+ let _f = FailGuard::new("default_fs::create::err", "return"); + assert!(engine + .write(&mut generate_batch(1, 4, 5, Some(&entry)), false) + .is_err()); + } + if restart_after_failure { + drop(engine); + engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + } + let num_files_before = std::fs::read_dir(&dir).unwrap().count(); + { + // Fail to write header of new log file. + let _f = FailGuard::new("log_file::write::err", "1*off->return"); + assert!(engine + .write(&mut generate_batch(1, 4, 5, Some(&entry)), false) + .is_err()); + } + if restart_after_failure { + drop(engine); + engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + // The new log file is added during recovery phase of restart. + assert_eq!(engine.file_span(LogQueue::Append).1, 2); + } else { + assert_eq!(engine.file_span(LogQueue::Append).1, 1); + } + // Although the header is not written, the file is still created. + assert_eq!( + std::fs::read_dir(&dir).unwrap().count() - num_files_before, + 1 + ); + if !restart_after_failure { + // If the engine restarted, the write does not require sync will succeed. + // Fail to sync new log file. The old log file is already sync-ed at this point. + let _f = FailGuard::new("log_fd::sync::err", "return"); + assert!(catch_unwind_silent(|| { + let _ = engine.write(&mut generate_batch(1, 4, 5, Some(&entry)), false); + }) + .is_err()); + assert_eq!(engine.file_span(LogQueue::Append).1, 1); + } + + // Only one log file should be created after all the incidents. + assert_eq!( + std::fs::read_dir(&dir).unwrap().count() - num_files_before, + 1 + ); + // We can continue writing after the incidents. + engine + .write(&mut generate_batch(2, 1, 2, Some(&entry)), true) + .unwrap(); + if restart_after_failure { + drop(engine); + engine = Engine::open_with_file_system(cfg, fs).unwrap(); + } + assert_eq!( + std::fs::read_dir(&dir).unwrap().count() - num_files_before, + 1 + ); + assert_eq!(engine.first_index(1).unwrap(), 1); + assert_eq!(engine.last_index(1).unwrap(), 4); + assert_eq!(engine.first_index(2).unwrap(), 1); + assert_eq!(engine.last_index(2).unwrap(), 1); +} + +#[test] +fn test_file_rotate_error_without_restart() { + test_file_rotate_error(false); +} + +#[test] +fn test_file_rotate_error_with_restart() { + test_file_rotate_error(true); +} + +#[test] +fn test_concurrent_write_error() { + let dir = tempfile::Builder::new() + .prefix("test_concurrent_write_error") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(1024), + ..Default::default() + }; + let entry = vec![b'x'; 1024]; + + // Don't use ObfuscatedFileSystem. It will split IO. + let engine = Arc::new(Engine::open(cfg.clone()).unwrap()); + let mut ctx = ConcurrentWriteContext::new(engine.clone()); + + // The second of three writes will fail. 
+ fail::cfg("log_file::write::err", "1*off->1*return->off").unwrap(); + let entry_clone = entry.clone(); + ctx.write_ext(move |e| { + e.write(&mut generate_batch(1, 1, 11, Some(&entry_clone)), false) + .unwrap(); + }); + let entry_clone = entry.clone(); + ctx.write_ext(move |e| { + e.write(&mut generate_batch(2, 1, 11, Some(&entry_clone)), false) + .unwrap_err(); + }); + let entry_clone = entry.clone(); + ctx.write_ext(move |e| { + e.write(&mut generate_batch(3, 1, 11, Some(&entry_clone)), false) + .unwrap(); + }); + ctx.join(); + + assert_eq!( + 10, + engine + .fetch_entries_to::(1, 1, 11, None, &mut vec![]) + .unwrap() + ); + assert_eq!( + 0, + engine + .fetch_entries_to::(2, 1, 11, None, &mut vec![]) + .unwrap() + ); + assert_eq!( + 10, + engine + .fetch_entries_to::(3, 1, 11, None, &mut vec![]) + .unwrap() + ); + + { + let _f1 = FailGuard::new("log_file::write::err", "return"); + let _f2 = FailGuard::new("log_file::truncate::err", "return"); + let entry_clone = entry.clone(); + ctx.write_ext(move |e| { + e.write(&mut generate_batch(1, 11, 21, Some(&entry_clone)), false) + .unwrap_err(); + }); + // We don't test followers, their panics are hard to catch. + ctx.join(); + } + + // Internal states are consistent after panics. + engine + .write(&mut generate_batch(1, 11, 21, Some(&entry)), true) + .unwrap(); + drop(ctx); + drop(engine); + + let engine = Engine::open(cfg).unwrap(); + assert_eq!( + 20, + engine + .fetch_entries_to::(1, 1, 21, None, &mut vec![]) + .unwrap() + ); +} + +#[test] +fn test_non_atomic_write_error() { + let dir = tempfile::Builder::new() + .prefix("test_non_atomic_write_error") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(1024), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let entry = vec![b'x'; 1024]; + let rid = 1; + + { + // Write partially succeeds. We can reopen. + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let _f1 = FailGuard::new("log_file::write::err", "return"); + engine + .write(&mut generate_batch(rid, 0, 1, Some(&entry)), true) + .unwrap_err(); + } + { + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + assert_eq!(engine.first_index(rid), None); + } + { + // Write partially succeeds. We can overwrite. + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let _f1 = FailGuard::new("log_file::write::err", "1*off->1*return->off"); + engine + .write(&mut generate_batch(rid, 0, 1, Some(&entry)), true) + .unwrap_err(); + engine + .write(&mut generate_batch(rid, 5, 6, Some(&entry)), true) + .unwrap(); + assert_eq!(engine.first_index(rid).unwrap(), 5); + } + { + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + assert_eq!(engine.first_index(rid).unwrap(), 5); + } + { + // Write partially succeeds and can't be reverted. We panic. 
+ let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + let _f1 = FailGuard::new("log_file::write::err", "return"); + let _f2 = FailGuard::new("log_file::seek::err", "return"); + assert!(catch_unwind_silent(|| { + engine + .write(&mut generate_batch(rid, 6, 7, Some(&entry)), true) + .unwrap_err(); + }) + .is_err()); + } + { + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + assert_eq!(engine.last_index(rid), Some(5)); + } +} + +#[cfg(feature = "scripting")] +#[test] +fn test_error_during_repair() { + let dir = tempfile::Builder::new() + .prefix("test_error_during_repair") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize(1), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let entry = vec![b'x'; 1024]; + + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + for rid in 1..=10 { + engine + .write(&mut generate_batch(rid, 1, 11, Some(&entry)), true) + .unwrap(); + } + drop(engine); + + let script = " + fn filter_append(id, first, count, rewrite_count, queue, ifirst, ilast) { + 1 // discard incoming + } + " + .to_owned(); + { + let _f = FailGuard::new("log_file::write::err", "return"); + assert!( + Engine::unsafe_repair_with_file_system(dir.path(), None, script, fs.clone()).is_err() + ); + } + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + for rid in 1..=10 { + assert_eq!( + 10, + engine + .fetch_entries_to::(rid, 1, 11, None, &mut vec![]) + .unwrap() + ); + } +} + +#[cfg(all(feature = "swap", feature = "internals"))] +#[test] +fn test_swappy_page_create_error() { + use raft_engine::internals::SwappyAllocator; + let dir = tempfile::Builder::new() + .prefix("test_swappy_page_create_error") + .tempdir() + .unwrap(); + + let allocator = SwappyAllocator::new(dir.path(), 0); + + let mut vec: Vec = Vec::new_in(allocator.clone()); + { + let _f = FailGuard::new("swappy::page::new_failure", "return"); + vec.resize(128, 0); + assert_eq!(allocator.memory_usage(), 128); + } + vec.resize(1024, 0); + assert_eq!(allocator.memory_usage(), 0); +} + +#[test] +fn test_file_allocate_error() { + let dir = tempfile::Builder::new() + .prefix("test_file_allocate_error") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::mb(100), + ..Default::default() + }; + let fs = Arc::new(ObfuscatedFileSystem::default()); + let entry = vec![b'x'; 1024]; + { + let _f = FailGuard::new("log_file::allocate::err", "return"); + let engine = Engine::open_with_file_system(cfg.clone(), fs.clone()).unwrap(); + engine + .write(&mut generate_batch(1, 1, 5, Some(&entry)), true) + .unwrap(); + } + let engine = Engine::open_with_file_system(cfg, fs).unwrap(); + assert_eq!(engine.first_index(1).unwrap(), 1); + assert_eq!(engine.last_index(1).unwrap(), 4); +} + +#[test] +fn test_start_with_recycled_file_allocate_error() { + let dir = tempfile::Builder::new() + .prefix("test_start_with_recycled_file_allocate_error") + .tempdir() + .unwrap(); + let cfg = Config { + dir: dir.path().to_str().unwrap().to_owned(), + target_file_size: ReadableSize::kb(1), + purge_threshold: ReadableSize::kb(10), // capacity is 12 + enable_log_recycle: true, + prefill_for_recycle: true, + ..Default::default() + }; + let entry = vec![b'x'; 1024]; + // Mock that the engine starts with the circumstance where + // the pref-reserved file with seqno[5] failed to be generated. 
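+    // ("4*off->1*return->off": the failpoint stays off for the first four
+    // prefill writes and triggers on the fifth, so only that pre-reserved file
+    // fails to be generated; afterwards it is disabled again.)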
+    {
+        let _f = FailGuard::new("log_file::write::zero", "4*off->1*return->off");
+        Engine::open(cfg.clone()).unwrap();
+    }
+    // Extra recycled files have been supplemented.
+    let engine = Engine::open(cfg).unwrap();
+    engine
+        .write(&mut generate_batch(1, 1, 5, Some(&entry)), true)
+        .unwrap();
+    let (start, end) = engine.file_span(LogQueue::Append);
+    assert_eq!(start, end);
+    // Append several entries to make the Engine reuse the recycled logs.
+    for r in 2..6 {
+        engine
+            .write(&mut generate_batch(r, 1, 5, Some(&entry)), true)
+            .unwrap();
+    }
+    let (reused_start, reused_end) = engine.file_span(LogQueue::Append);
+    assert_eq!((reused_start, reused_end), (1, 5));
+    assert!(reused_end > end);
+    assert_eq!(engine.first_index(1).unwrap(), 1);
+    assert_eq!(engine.last_index(1).unwrap(), 4);
+    assert_eq!(engine.last_index(5).unwrap(), 4);
+    let mut entries = Vec::new();
+    engine
+        .fetch_entries_to::<MessageExtTyped>(5, 1, 5, None, &mut entries)
+        .unwrap();
+    // Continuously append entries to reach the purge_threshold.
+    for r in 6..=15 {
+        engine
+            .write(&mut generate_batch(r, 1, 5, Some(&entry)), true)
+            .unwrap();
+    }
+    assert_eq!(engine.file_span(LogQueue::Append).0, reused_start);
+    assert!(engine.file_span(LogQueue::Append).1 > reused_end);
+    let (start, _) = engine.file_span(LogQueue::Append);
+    // Purge and check.
+    engine.purge_expired_files().unwrap();
+    assert!(engine.file_span(LogQueue::Append).0 > start);
+}
+
+#[test]
+fn test_no_space_write_error() {
+    let mut cfg_list = [
+        Config {
+            target_file_size: ReadableSize::kb(2),
+            format_version: Version::V1,
+            enable_log_recycle: false,
+            ..Default::default()
+        },
+        Config {
+            target_file_size: ReadableSize::kb(2),
+            format_version: Version::V2,
+            enable_log_recycle: true,
+            ..Default::default()
+        },
+    ];
+    let entry = vec![b'x'; 1024];
+    for cfg in cfg_list.iter_mut() {
+        let dir = tempfile::Builder::new()
+            .prefix("test_no_space_write_error_main")
+            .tempdir()
+            .unwrap();
+        let spill_dir = tempfile::Builder::new()
+            .prefix("test_no_space_write_error_spill")
+            .tempdir()
+            .unwrap();
+        cfg.dir = dir.path().to_str().unwrap().to_owned();
+        cfg.spill_dir = Some(spill_dir.path().to_str().unwrap().to_owned());
+        {
+            // Case 1: `write` fails with a no-space error; the Engine should fail at
+            // `rotate`.
+            let cfg_err = Config {
+                target_file_size: ReadableSize(1),
+                ..cfg.clone()
+            };
+            let engine = Engine::open(cfg_err).unwrap();
+            let _f = FailGuard::new("log_fd::write::no_space_err", "return");
+            assert!(engine
+                .write(&mut generate_batch(2, 11, 21, Some(&entry)), true)
+                .is_err());
+            assert_eq!(
+                0,
+                engine
+                    .fetch_entries_to::<MessageExtTyped>(2, 11, 21, None, &mut vec![])
+                    .unwrap()
+            );
+        }
+        {
+            let engine = Engine::open(cfg.clone()).unwrap();
+            // Case 2: the disk goes from `full (nospace err)` to `spare for writing`.
+            let _f1 = FailGuard::new("log_fd::write::no_space_err", "2*return->off");
+            let _f2 = FailGuard::new("file_pipe_log::force_choose_dir", "return");
+            // The first write should fail, because all dirs have run out of space.
+            assert!(engine
+                .write(&mut generate_batch(2, 11, 21, Some(&entry)), true)
+                .is_err());
+            assert_eq!(
+                0,
+                engine
+                    .fetch_entries_to::<MessageExtTyped>(2, 11, 21, None, &mut vec![])
+                    .unwrap()
+            );
+            // The second write should succeed, as free space is available for later
+            // writes after cleaning up.
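+            // (The two injected `no_space_err` returns were consumed by the failed
+            // write above, so this write no longer hits an error.)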
+            engine
+                .write(&mut generate_batch(3, 11, 21, Some(&entry)), true)
+                .unwrap();
+            assert_eq!(
+                10,
+                engine
+                    .fetch_entries_to::<MessageExtTyped>(3, 11, 21, None, &mut vec![])
+                    .unwrap()
+            );
+        }
+        {
+            // Case 3: disk status -- `main dir is full (has nospace err)` -> `spill-dir
+            // is spare (has enough space)`.
+            let engine = Engine::open(cfg.clone()).unwrap();
+            let _f1 = FailGuard::new("log_fd::write::no_space_err", "1*return->off");
+            let _f2 = FailGuard::new("file_pipe_log::force_choose_dir", "return(1)");
+            engine
+                .write(&mut generate_batch(5, 11, 21, Some(&entry)), true)
+                .unwrap();
+            engine
+                .write(&mut generate_batch(6, 11, 21, Some(&entry)), true)
+                .unwrap();
+            assert_eq!(
+                10,
+                engine
+                    .fetch_entries_to::<MessageExtTyped>(5, 11, 21, None, &mut vec![])
+                    .unwrap()
+            );
+            assert_eq!(
+                10,
+                engine
+                    .fetch_entries_to::<MessageExtTyped>(6, 11, 21, None, &mut vec![])
+                    .unwrap()
+            );
+        }
+        {
+            // Case 4: disk status -- the main dir has free space for rotating new files
+            // but no space for dumping the LogBatch; the disk keeps returning
+            // `nospace err`, so the write fails even after retrying.
+            let engine = Engine::open(cfg.clone()).unwrap();
+            let _f = FailGuard::new(
+                "log_fd::write::no_space_err",
+                "1*return->1*off->1*return->1*off",
+            );
+            assert!(engine
+                .write(&mut generate_batch(7, 11, 21, Some(&entry)), true)
+                .is_err());
+            assert_eq!(
+                0,
+                engine
+                    .fetch_entries_to::<MessageExtTyped>(7, 11, 21, None, &mut vec![])
+                    .unwrap()
+            );
+        }
+    }
+}
diff --git a/third/raft-engine/tests/failpoints/util.rs b/third/raft-engine/tests/failpoints/util.rs
new file mode 100644
index 00000000..188f66d7
--- /dev/null
+++ b/third/raft-engine/tests/failpoints/util.rs
@@ -0,0 +1,120 @@
+// Copyright (c) 2017-present, PingCAP, Inc. Licensed under Apache-2.0.
+
+use std::panic::{self, AssertUnwindSafe};
+use std::sync::{mpsc, Arc};
+
+use raft::eraftpb::Entry;
+use raft_engine::env::FileSystem;
+use raft_engine::{Engine, LogBatch, MessageExt};
+
+#[derive(Clone)]
+pub struct MessageExtTyped;
+impl MessageExt for MessageExtTyped {
+    type Entry = Entry;
+
+    fn index(entry: &Entry) -> u64 {
+        entry.index
+    }
+}
+
+pub fn generate_entries(begin_index: u64, end_index: u64, data: Option<&[u8]>) -> Vec<Entry> {
+    let mut v = vec![Entry::new(); (end_index - begin_index) as usize];
+    let mut index = begin_index;
+    for e in v.iter_mut() {
+        e.set_index(index);
+        if let Some(data) = data {
+            e.set_data(data.to_vec().into())
+        }
+        index += 1;
+    }
+    v
+}
+
+pub fn generate_batch(
+    region: u64,
+    begin_index: u64,
+    end_index: u64,
+    data: Option<&[u8]>,
+) -> LogBatch {
+    let mut batch = LogBatch::default();
+    batch
+        .add_entries::<MessageExtTyped>(region, &generate_entries(begin_index, end_index, data))
+        .unwrap();
+    batch
+}
+
+/// Catch panic while suppressing default panic hook.
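+/// The previous hook is restored before returning.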
+pub fn catch_unwind_silent<F, R>(f: F) -> std::thread::Result<R>
+where
+    F: FnOnce() -> R,
+{
+    let prev_hook = panic::take_hook();
+    panic::set_hook(Box::new(|_| {}));
+    let result = panic::catch_unwind(AssertUnwindSafe(f));
+    panic::set_hook(prev_hook);
+    result
+}
+
+pub struct ConcurrentWriteContext<FS: FileSystem> {
+    engine: Arc<Engine<FS>>,
+    ths: Vec<std::thread::JoinHandle<()>>,
+}
+
+impl<FS: FileSystem> ConcurrentWriteContext<FS> {
+    pub fn new(engine: Arc<Engine<FS>>) -> Self {
+        Self {
+            engine,
+            ths: Vec::new(),
+        }
+    }
+
+    pub fn write(&mut self, mut log_batch: LogBatch) {
+        self.write_ext(move |e| {
+            e.write(&mut log_batch, true).unwrap();
+        });
+    }
+
+    pub fn write_ext<F>(&mut self, f: F)
+    where
+        F: FnOnce(&Engine<FS>) + Send + Sync + 'static,
+    {
+        let (ready_tx, ready_rx) = mpsc::channel();
+        if self.ths.is_empty() {
+            fail::cfg("write_barrier::leader_exit", "pause").unwrap();
+            let engine_clone = self.engine.clone();
+            let ready_tx_clone = ready_tx.clone();
+            self.ths.push(
+                std::thread::Builder::new()
+                    .spawn(move || {
+                        ready_tx_clone.send(()).unwrap();
+                        // No-op.
+                        engine_clone.write(&mut LogBatch::default(), false).unwrap();
+                    })
+                    .unwrap(),
+            );
+            std::thread::sleep(std::time::Duration::from_millis(100));
+            ready_rx.recv().unwrap();
+        } else {
+            // Follower.
+            assert!(self.ths.len() >= 2);
+        }
+        let engine_clone = self.engine.clone();
+        self.ths.push(
+            std::thread::Builder::new()
+                .spawn(move || {
+                    ready_tx.send(()).unwrap();
+                    f(&engine_clone);
+                })
+                .unwrap(),
+        );
+        std::thread::sleep(std::time::Duration::from_millis(100));
+        ready_rx.recv().unwrap();
+    }
+
+    pub fn join(&mut self) {
+        fail::remove("write_barrier::leader_exit");
+        for t in self.ths.drain(..) {
+            t.join().unwrap();
+        }
+    }
+}
diff --git a/tools/ci/licenserc.yml b/tools/ci/licenserc.yml
index f6b3f4bc..aac2ef1f 100644
--- a/tools/ci/licenserc.yml
+++ b/tools/ci/licenserc.yml
@@ -22,3 +22,4 @@ header:
     - '**/*.proto'
   paths-ignore:
     - 'src/server/proto/include/*.proto'
+    - 'third/'