From d1c93b8142602095d9e99e5ec0b9bfae492df00f Mon Sep 17 00:00:00 2001 From: SuperYoko <90179377+SuperYoko@users.noreply.github.com> Date: Wed, 14 Sep 2022 09:11:14 +0000 Subject: [PATCH 1/2] add retry for god user check when leader changed --- src/daemons/MetaDaemon.cpp | 43 +++++++++++++++++++---------- src/daemons/StandAloneDaemon.cpp | 46 ++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 31 deletions(-) diff --git a/src/daemons/MetaDaemon.cpp b/src/daemons/MetaDaemon.cpp index 5ccedd0f1ff..64feb425d29 100644 --- a/src/daemons/MetaDaemon.cpp +++ b/src/daemons/MetaDaemon.cpp @@ -7,6 +7,7 @@ #include #include "MetaDaemonInit.h" +#include "clients/meta/MetaClient.h" #include "common/base/Base.h" #include "common/base/SignalHandler.h" #include "common/fs/FileUtils.h" @@ -44,6 +45,7 @@ DEFINE_int32(port, 45500, "Meta daemon listening port"); DEFINE_bool(reuse_port, true, "Whether to turn on the SO_REUSEPORT option"); DECLARE_string(data_path); DECLARE_string(meta_server_addrs); +DECLARE_uint32(raft_heartbeat_interval_secs); DEFINE_int32(meta_http_thread_num, 3, "Number of meta daemon's http thread"); DEFINE_string(pid_file, "pids/nebula-metad.pid", "File to hold the process id"); @@ -168,27 +170,40 @@ int main(int argc, char* argv[]) { } { - /** - * Only leader part needed. - */ - auto ret = gKVStore->partLeader(nebula::kDefaultSpaceId, nebula::kDefaultPartId); - if (!nebula::ok(ret)) { - LOG(ERROR) << "Part leader get failed"; - return EXIT_FAILURE; - } - if (nebula::value(ret) == localhost) { - LOG(INFO) << "Check and init root user"; + const int kMaxRetryTime = FLAGS_raft_heartbeat_interval_secs * 2; + constexpr int kRetryInterval = 1; + int retryTime = 0; + while (true) { + auto ret = gKVStore->partLeader(nebula::kDefaultSpaceId, nebula::kDefaultPartId); + if (!nebula::ok(ret)) { + LOG(ERROR) << "Part leader get failed"; + return EXIT_FAILURE; + } + LOG(INFO) << "Check root user"; auto checkRet = nebula::meta::RootUserMan::isGodExists(gKVStore.get()); if (!nebula::ok(checkRet)) { auto retCode = nebula::error(checkRet); + if (retCode == nebula::cpp2::ErrorCode::E_LEADER_CHANGED) { + LOG(INFO) << "Leader changed, retry"; + retryTime += kRetryInterval; + if (retryTime > kMaxRetryTime) { + LOG(ERROR) << "Retry too many times"; + return EXIT_FAILURE; + } + sleep(kRetryInterval); + continue; + } LOG(ERROR) << "Parser God Role error:" << apache::thrift::util::enumNameSafe(retCode); return EXIT_FAILURE; } - auto existGod = nebula::value(checkRet); - if (!existGod && !nebula::meta::RootUserMan::initRootUser(gKVStore.get())) { - LOG(ERROR) << "Init root user failed"; - return EXIT_FAILURE; + if (nebula::value(ret) == localhost) { + auto existGod = nebula::value(checkRet); + if (!existGod && !nebula::meta::RootUserMan::initRootUser(gKVStore.get())) { + LOG(ERROR) << "Init root user failed"; + return EXIT_FAILURE; + } } + break; } } diff --git a/src/daemons/StandAloneDaemon.cpp b/src/daemons/StandAloneDaemon.cpp index 5b519f726f5..f9dd39596ce 100644 --- a/src/daemons/StandAloneDaemon.cpp +++ b/src/daemons/StandAloneDaemon.cpp @@ -224,30 +224,42 @@ int main(int argc, char *argv[]) { } { - /** - * Only leader part needed. - */ - auto ret = gMetaKVStore->partLeader(nebula::kDefaultSpaceId, nebula::kDefaultPartId); - if (!nebula::ok(ret)) { - LOG(ERROR) << "Part leader get failed"; - return; - } - if (nebula::value(ret) == metaLocalhost) { - LOG(INFO) << "Check and init root user"; - auto checkRet = nebula::meta::RootUserMan::isGodExists(gMetaKVStore.get()); + const int kMaxRetryTime = FLAGS_raft_heartbeat_interval_secs * 2; + constexpr int kRetryInterval = 1; + int retryTime = 0; + while (true) { + auto ret = gKVStore->partLeader(nebula::kDefaultSpaceId, nebula::kDefaultPartId); + if (!nebula::ok(ret)) { + LOG(ERROR) << "Part leader get failed"; + return EXIT_FAILURE; + } + LOG(INFO) << "Check root user"; + auto checkRet = nebula::meta::RootUserMan::isGodExists(gKVStore.get()); if (!nebula::ok(checkRet)) { auto retCode = nebula::error(checkRet); + if (retCode == nebula::cpp2::ErrorCode::E_LEADER_CHANGED) { + LOG(INFO) << "Leader changed, retry"; + retryTime += kRetryInterval; + if (retryTime > kMaxRetryTime) { + LOG(ERROR) << "Retry too many times"; + return EXIT_FAILURE; + } + sleep(kRetryInterval); + continue; + } LOG(ERROR) << "Parser God Role error:" << apache::thrift::util::enumNameSafe(retCode); - return; + return EXIT_FAILURE; } - auto existGod = nebula::value(checkRet); - if (!existGod && !nebula::meta::RootUserMan::initRootUser(gMetaKVStore.get())) { - LOG(ERROR) << "Init root user failed"; - return; + if (nebula::value(ret) == localhost) { + auto existGod = nebula::value(checkRet); + if (!existGod && !nebula::meta::RootUserMan::initRootUser(gKVStore.get())) { + LOG(ERROR) << "Init root user failed"; + return EXIT_FAILURE; + } } + break; } } - auto handler = std::make_shared(gMetaKVStore.get(), metaClusterId()); LOG(INFO) << "The meta deamon start on " << metaLocalhost; From c9abbb5a00fe244819ec02f6215e454b50e51861 Mon Sep 17 00:00:00 2001 From: SuperYoko <90179377+SuperYoko@users.noreply.github.com> Date: Tue, 20 Sep 2022 07:09:07 +0000 Subject: [PATCH 2/2] Modify init retry --- src/daemons/MetaDaemon.cpp | 42 +++--------------------- src/daemons/MetaDaemonInit.cpp | 56 ++++++++++++++++++++++++++++++++ src/daemons/MetaDaemonInit.h | 4 +++ src/daemons/StandAloneDaemon.cpp | 41 +++-------------------- src/meta/RootUserMan.h | 12 +++---- 5 files changed, 74 insertions(+), 81 deletions(-) diff --git a/src/daemons/MetaDaemon.cpp b/src/daemons/MetaDaemon.cpp index 64feb425d29..0fa5f7ebafc 100644 --- a/src/daemons/MetaDaemon.cpp +++ b/src/daemons/MetaDaemon.cpp @@ -26,7 +26,6 @@ #include "meta/KVBasedClusterIdMan.h" #include "meta/MetaServiceHandler.h" #include "meta/MetaVersionMan.h" -#include "meta/RootUserMan.h" #include "meta/http/MetaHttpReplaceHostHandler.h" #include "meta/processors/job/JobManager.h" #include "meta/stats/MetaStats.h" @@ -45,7 +44,6 @@ DEFINE_int32(port, 45500, "Meta daemon listening port"); DEFINE_bool(reuse_port, true, "Whether to turn on the SO_REUSEPORT option"); DECLARE_string(data_path); DECLARE_string(meta_server_addrs); -DECLARE_uint32(raft_heartbeat_interval_secs); DEFINE_int32(meta_http_thread_num, 3, "Number of meta daemon's http thread"); DEFINE_string(pid_file, "pids/nebula-metad.pid", "File to hold the process id"); @@ -169,42 +167,10 @@ int main(int argc, char* argv[]) { } } - { - const int kMaxRetryTime = FLAGS_raft_heartbeat_interval_secs * 2; - constexpr int kRetryInterval = 1; - int retryTime = 0; - while (true) { - auto ret = gKVStore->partLeader(nebula::kDefaultSpaceId, nebula::kDefaultPartId); - if (!nebula::ok(ret)) { - LOG(ERROR) << "Part leader get failed"; - return EXIT_FAILURE; - } - LOG(INFO) << "Check root user"; - auto checkRet = nebula::meta::RootUserMan::isGodExists(gKVStore.get()); - if (!nebula::ok(checkRet)) { - auto retCode = nebula::error(checkRet); - if (retCode == nebula::cpp2::ErrorCode::E_LEADER_CHANGED) { - LOG(INFO) << "Leader changed, retry"; - retryTime += kRetryInterval; - if (retryTime > kMaxRetryTime) { - LOG(ERROR) << "Retry too many times"; - return EXIT_FAILURE; - } - sleep(kRetryInterval); - continue; - } - LOG(ERROR) << "Parser God Role error:" << apache::thrift::util::enumNameSafe(retCode); - return EXIT_FAILURE; - } - if (nebula::value(ret) == localhost) { - auto existGod = nebula::value(checkRet); - if (!existGod && !nebula::meta::RootUserMan::initRootUser(gKVStore.get())) { - LOG(ERROR) << "Init root user failed"; - return EXIT_FAILURE; - } - } - break; - } + auto godInit = initGodUser(gKVStore.get(), localhost); + if (godInit != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Init god user failed"; + return EXIT_FAILURE; } auto metaServer = std::make_unique(); diff --git a/src/daemons/MetaDaemonInit.cpp b/src/daemons/MetaDaemonInit.cpp index ef587ec6bda..fe76db5f26b 100644 --- a/src/daemons/MetaDaemonInit.cpp +++ b/src/daemons/MetaDaemonInit.cpp @@ -10,6 +10,7 @@ #include "common/base/Base.h" #include "common/base/SignalHandler.h" +#include "common/datatypes/HostAddr.h" #include "common/fs/FileUtils.h" #include "common/hdfs/HdfsCommandHelper.h" #include "common/hdfs/HdfsHelper.h" @@ -17,12 +18,14 @@ #include "common/ssl/SSLConfig.h" #include "common/thread/GenericThreadPool.h" #include "common/utils/MetaKeyUtils.h" +#include "kvstore/KVStore.h" #include "kvstore/NebulaStore.h" #include "kvstore/PartManager.h" #include "meta/ActiveHostsMan.h" #include "meta/KVBasedClusterIdMan.h" #include "meta/MetaServiceHandler.h" #include "meta/MetaVersionMan.h" +#include "meta/RootUserMan.h" #include "meta/http/MetaHttpReplaceHostHandler.h" #include "meta/processors/job/JobManager.h" #include "meta/stats/MetaStats.h" @@ -46,6 +49,8 @@ DECLARE_string(meta_server_addrs); // use define from grap flags. DEFINE_int32(ws_meta_http_port, 11000, "Port to listen on Meta with HTTP protocol"); #endif +DECLARE_uint32(raft_heartbeat_interval_secs); + using nebula::web::PathParams; namespace nebula::meta { @@ -158,6 +163,57 @@ std::unique_ptr initKV(std::vector p return kvstore; } +nebula::cpp2::ErrorCode initGodUser(nebula::kvstore::KVStore* kvstore, + const nebula::HostAddr& localhost) { + const int kMaxRetryTime = FLAGS_raft_heartbeat_interval_secs * 3; + constexpr int kRetryInterval = 1; + int retryTime = 0; + // Both leader & follower need to wait all reading ok. + // leader init need to retry writing if leader changed. + while (true) { + retryTime += kRetryInterval; + if (retryTime > kMaxRetryTime) { + LOG(ERROR) << "Retry too many times"; + return nebula::cpp2::ErrorCode::E_RETRY_EXHAUSTED; + } + auto ret = kvstore->partLeader(nebula::kDefaultSpaceId, nebula::kDefaultPartId); + if (!nebula::ok(ret)) { + LOG(ERROR) << "Part leader get failed"; + return nebula::error(ret); + } + LOG(INFO) << "Check root user"; // follower need to wait reading all ok, too. + auto checkRet = nebula::meta::RootUserMan::isGodExists(kvstore); + if (!nebula::ok(checkRet)) { + auto retCode = nebula::error(checkRet); + if (retCode == nebula::cpp2::ErrorCode::E_LEADER_CHANGED) { + LOG(INFO) << "Leader changed, retry"; + sleep(kRetryInterval); + continue; + } + LOG(ERROR) << "Parser God Role error:" << apache::thrift::util::enumNameSafe(retCode); + return nebula::error(checkRet); + } + if (nebula::value(ret) == localhost) { + auto existGod = nebula::value(checkRet); + if (!existGod) { + auto initGod = nebula::meta::RootUserMan::initRootUser(kvstore); + if (initGod != nebula::cpp2::ErrorCode::SUCCEEDED) { + if (initGod != nebula::cpp2::ErrorCode::E_LEADER_CHANGED) { + LOG(ERROR) << "Init God Role error:" << apache::thrift::util::enumNameSafe(initGod); + return initGod; + } else { + LOG(INFO) << "Leader changed, retry"; + sleep(kRetryInterval); + continue; + } + } + } + } + break; + } + return nebula::cpp2::ErrorCode::SUCCEEDED; +} + nebula::Status initWebService(nebula::WebService* svc, nebula::kvstore::KVStore* kvstore) { LOG(INFO) << "Starting Meta HTTP Service"; auto& router = svc->router(); diff --git a/src/daemons/MetaDaemonInit.h b/src/daemons/MetaDaemonInit.h index 0a94ae4bbd4..dfcd8077e06 100644 --- a/src/daemons/MetaDaemonInit.h +++ b/src/daemons/MetaDaemonInit.h @@ -9,6 +9,7 @@ #include "common/base/Status.h" #include "common/hdfs/HdfsCommandHelper.h" +#include "interface/gen-cpp2/common_types.h" #include "kvstore/KVStore.h" #include "webservice/WebService.h" @@ -18,4 +19,7 @@ std::unique_ptr initKV(std::vector p nebula::HostAddr localhost); nebula::Status initWebService(nebula::WebService* svc, nebula::kvstore::KVStore* kvstore); + +nebula::cpp2::ErrorCode initGodUser(nebula::kvstore::KVStore* kvstore, + const nebula::HostAddr& localhost); #endif diff --git a/src/daemons/StandAloneDaemon.cpp b/src/daemons/StandAloneDaemon.cpp index f9dd39596ce..f85b9f4fddc 100644 --- a/src/daemons/StandAloneDaemon.cpp +++ b/src/daemons/StandAloneDaemon.cpp @@ -223,43 +223,12 @@ int main(int argc, char *argv[]) { } } - { - const int kMaxRetryTime = FLAGS_raft_heartbeat_interval_secs * 2; - constexpr int kRetryInterval = 1; - int retryTime = 0; - while (true) { - auto ret = gKVStore->partLeader(nebula::kDefaultSpaceId, nebula::kDefaultPartId); - if (!nebula::ok(ret)) { - LOG(ERROR) << "Part leader get failed"; - return EXIT_FAILURE; - } - LOG(INFO) << "Check root user"; - auto checkRet = nebula::meta::RootUserMan::isGodExists(gKVStore.get()); - if (!nebula::ok(checkRet)) { - auto retCode = nebula::error(checkRet); - if (retCode == nebula::cpp2::ErrorCode::E_LEADER_CHANGED) { - LOG(INFO) << "Leader changed, retry"; - retryTime += kRetryInterval; - if (retryTime > kMaxRetryTime) { - LOG(ERROR) << "Retry too many times"; - return EXIT_FAILURE; - } - sleep(kRetryInterval); - continue; - } - LOG(ERROR) << "Parser God Role error:" << apache::thrift::util::enumNameSafe(retCode); - return EXIT_FAILURE; - } - if (nebula::value(ret) == localhost) { - auto existGod = nebula::value(checkRet); - if (!existGod && !nebula::meta::RootUserMan::initRootUser(gKVStore.get())) { - LOG(ERROR) << "Init root user failed"; - return EXIT_FAILURE; - } - } - break; - } + auto godInit = initGodUser(gMetaKVStore.get(), metaLocalhost); + if (godInit != nebula::cpp2::ErrorCode::SUCCEEDED) { + LOG(ERROR) << "Init god user failed"; + return; } + auto handler = std::make_shared(gMetaKVStore.get(), metaClusterId()); LOG(INFO) << "The meta deamon start on " << metaLocalhost; diff --git a/src/meta/RootUserMan.h b/src/meta/RootUserMan.h index c8bde933203..af3e454f312 100644 --- a/src/meta/RootUserMan.h +++ b/src/meta/RootUserMan.h @@ -10,6 +10,7 @@ #include "common/base/Base.h" #include "common/utils/MetaKeyUtils.h" +#include "interface/gen-cpp2/common_types.h" #include "kvstore/KVStore.h" namespace nebula { @@ -53,7 +54,7 @@ class RootUserMan { } } - static bool initRootUser(kvstore::KVStore* kv) { + static nebula::cpp2::ErrorCode initRootUser(kvstore::KVStore* kv) { LOG(INFO) << "Init root user"; auto encodedPwd = proxygen::md5Encode(folly::StringPiece("nebula")); auto userKey = MetaKeyUtils::userKey("root"); @@ -64,18 +65,15 @@ class RootUserMan { std::vector data; data.emplace_back(std::move(userKey), std::move(userVal)); data.emplace_back(std::move(roleKey), std::move(roleVal)); - bool ret = true; + nebula::cpp2::ErrorCode ec; folly::Baton baton; kv->asyncMultiPut( kDefaultSpaceId, kDefaultPartId, std::move(data), [&](nebula::cpp2::ErrorCode code) { - if (code != nebula::cpp2::ErrorCode::SUCCEEDED) { - LOG(INFO) << "Put failed, error " << static_cast(code); - ret = false; - } + ec = code; baton.post(); }); baton.wait(); - return ret; + return ec; } };