From 494f50752e9eb9c53cca0be998c7351f3162d9e0 Mon Sep 17 00:00:00 2001 From: bright-starry-sky <56461666+bright-starry-sky@users.noreply.github.com> Date: Fri, 4 Sep 2020 11:51:08 +0800 Subject: [PATCH] Improved snapshot creation logic. (#2318) --- src/kvstore/NebulaStore.cpp | 32 +++++++++---------- src/kvstore/RocksEngine.cpp | 7 ++-- src/kvstore/raftex/RaftPart.cpp | 8 +++-- src/kvstore/wal/FileBasedWal.cpp | 7 ++++ src/meta/processors/admin/AdminClient.cpp | 10 ++---- .../admin/CreateSnapshotProcessor.cpp | 1 - 6 files changed, 34 insertions(+), 31 deletions(-) diff --git a/src/kvstore/NebulaStore.cpp b/src/kvstore/NebulaStore.cpp index 8b016ef39fd..7f6c66266ab 100644 --- a/src/kvstore/NebulaStore.cpp +++ b/src/kvstore/NebulaStore.cpp @@ -794,23 +794,21 @@ ResultCode NebulaStore::setWriteBlocking(GraphSpaceID spaceId, bool sign) { return error(partRet); } auto p = nebula::value(partRet); - if (p->isLeader()) { - auto ret = ResultCode::SUCCEEDED; - p->setBlocking(sign); - if (sign) { - folly::Baton baton; - p->sync([&ret, &baton] (kvstore::ResultCode code) { - if (kvstore::ResultCode::SUCCEEDED != code) { - ret = code; - } - baton.post(); - }); - baton.wait(); - } - if (ret != ResultCode::SUCCEEDED) { - LOG(ERROR) << "Part sync failed. space : " << spaceId << " Part : " << part; - return ret; - } + auto ret = ResultCode::SUCCEEDED; + p->setBlocking(sign); + if (sign) { + folly::Baton baton; + p->sync([&ret, &baton] (kvstore::ResultCode code) { + if (kvstore::ResultCode::SUCCEEDED != code) { + ret = code; + } + baton.post(); + }); + baton.wait(); + } + if (ret != ResultCode::SUCCEEDED) { + LOG(ERROR) << "Part sync failed. space : " << spaceId << " Part : " << part; + return ret; } } } diff --git a/src/kvstore/RocksEngine.cpp b/src/kvstore/RocksEngine.cpp index ed611a07dae..53dffd1f7f2 100644 --- a/src/kvstore/RocksEngine.cpp +++ b/src/kvstore/RocksEngine.cpp @@ -448,9 +448,10 @@ ResultCode RocksEngine::createCheckpoint(const std::string& name) { auto checkpointPath = folly::stringPrintf("%s/checkpoints/%s/data", dataPath_.c_str(), name.c_str()); LOG(INFO) << "Target checkpoint path : " << checkpointPath; - if (fs::FileUtils::exist(checkpointPath)) { - LOG(ERROR) << "The snapshot file already exists: " << checkpointPath; - return ResultCode::ERR_CHECKPOINT_ERROR; + if (fs::FileUtils::exist(checkpointPath) && + !fs::FileUtils::remove(checkpointPath.data(), true)) { + LOG(ERROR) << "Remove exist dir failed of checkpoint : " << checkpointPath; + return ResultCode::ERR_IO_ERROR; } auto parent = checkpointPath.substr(0, checkpointPath.rfind('/')); diff --git a/src/kvstore/raftex/RaftPart.cpp b/src/kvstore/raftex/RaftPart.cpp index aef427893f4..701d3c0b06c 100644 --- a/src/kvstore/raftex/RaftPart.cpp +++ b/src/kvstore/raftex/RaftPart.cpp @@ -554,10 +554,12 @@ folly::Future RaftPart::appendLogAsync(ClusterID source, LogType logType, std::string log, AtomicOp op) { - if (blocking_ && (logType == LogType::NORMAL || logType == LogType::ATOMIC_OP)) { - return AppendLogResult::E_WRITE_BLOCKING; + if (blocking_) { + // No need to block heartbeats and empty log. + if ((logType == LogType::NORMAL && !log.empty()) || logType == LogType::ATOMIC_OP) { + return AppendLogResult::E_WRITE_BLOCKING; + } } - LogCache swappedOutLogs; auto retFuture = folly::Future::makeEmpty(); diff --git a/src/kvstore/wal/FileBasedWal.cpp b/src/kvstore/wal/FileBasedWal.cpp index 72c5896ca00..71bc29da80c 100644 --- a/src/kvstore/wal/FileBasedWal.cpp +++ b/src/kvstore/wal/FileBasedWal.cpp @@ -620,6 +620,13 @@ bool FileBasedWal::linkCurrentWAL(const char* newPath) { LOG(INFO) << idStr_ << "No wal files found, skip link"; return true; } + + if (fs::FileUtils::exist(newPath) && + !fs::FileUtils::remove(newPath, true)) { + LOG(ERROR) << "Remove exist dir failed of wal : " << newPath; + return false; + } + if (!fs::FileUtils::makeDir(newPath)) { LOG(INFO) << idStr_ << "Link file parent dir make failed : " << newPath; return false; diff --git a/src/meta/processors/admin/AdminClient.cpp b/src/meta/processors/admin/AdminClient.cpp index 80ee94c4d72..9d3ddbed1e3 100644 --- a/src/meta/processors/admin/AdminClient.cpp +++ b/src/meta/processors/admin/AdminClient.cpp @@ -600,13 +600,9 @@ folly::Future AdminClient::createSnapshot(GraphSpaceID spaceId, folly::Promise pro; auto f = pro.getFuture(); - /** - * Don't need retry. - * Because existing checkpoint directories leads to fail again. - **/ getResponse({host}, 0, std::move(req), [] (auto client, auto request) { return client->future_createCheckpoint(request); - }, 0, std::move(pro), 0); + }, 0, std::move(pro), 3 /*The snapshot operation need to retry 3 times*/); return f; } @@ -625,7 +621,7 @@ folly::Future AdminClient::dropSnapshot(GraphSpaceID spaceId, auto f = pro.getFuture(); getResponse({host}, 0, std::move(req), [] (auto client, auto request) { return client->future_dropCheckpoint(request); - }, 0, std::move(pro), 1 /*The snapshot operation only needs to be retried twice*/); + }, 0, std::move(pro), 3 /*The snapshot operation need to retry 3 times*/); return f; } @@ -640,7 +636,7 @@ folly::Future AdminClient::blockingWrites(GraphSpaceID spaceId, auto f = pro.getFuture(); getResponse({host}, 0, std::move(req), [] (auto client, auto request) { return client->future_blockingWrites(request); - }, 0, std::move(pro), 1 /*The blocking needs to be retried twice*/); + }, 0, std::move(pro), 32 /*The blocking need to retry 32 times*/); return f; } diff --git a/src/meta/processors/admin/CreateSnapshotProcessor.cpp b/src/meta/processors/admin/CreateSnapshotProcessor.cpp index 1093057db56..dd8b10222a7 100644 --- a/src/meta/processors/admin/CreateSnapshotProcessor.cpp +++ b/src/meta/processors/admin/CreateSnapshotProcessor.cpp @@ -13,7 +13,6 @@ namespace nebula { namespace meta { void CreateSnapshotProcessor::process(const cpp2::CreateSnapshotReq&) { - folly::SharedMutex::ReadHolder rHolder(LockUtils::spaceLock()); // check the index rebuild. not allowed to create snapshot when index rebuilding. auto prefix = MetaServiceUtils::rebuildIndexStatusPrefix(); std::unique_ptr iter;