From 140fb17e4b4c466a2192e9facf513a92feaeb9b2 Mon Sep 17 00:00:00 2001 From: chenbo Date: Mon, 19 Sep 2022 21:53:15 +0800 Subject: [PATCH 01/29] photon + rocksdb a --- CMakeLists.txt | 27 +- db/db_compaction_test.cc | 1 + db/db_write_test.cc | 1 + db/dbformat_test.cc | 1 + db/range_del_aggregator_test.cc | 1 + db/range_tombstone_fragmenter_test.cc | 1 + db/write_callback_test.cc | 3 + env/env_posix.cc | 59 ++-- env/io_posix.cc | 36 +- examples/CMakeLists.txt | 7 + examples/perf/perf-client.cpp | 79 +++++ examples/perf/perf-server.cpp | 308 ++++++++++++++++++ examples/perf/protocol.h | 23 ++ include/rocksdb/env.h | 17 + memtable/inlineskiplist_test.cc | 1 + memtable/skiplist_test.cc | 1 + monitoring/iostats_context.cc | 4 +- monitoring/iostats_context_imp.h | 18 +- monitoring/iostats_context_test.cc | 1 + monitoring/perf_context.cc | 4 +- monitoring/perf_context_imp.h | 30 +- monitoring/perf_level.cc | 6 +- monitoring/perf_level_imp.h | 2 +- monitoring/perf_step_timer.h | 2 +- monitoring/thread_status_updater.cc | 3 +- monitoring/thread_status_updater.h | 4 +- monitoring/thread_status_util.cc | 7 +- monitoring/thread_status_util.h | 5 +- photon-auto-convert.sh | 17 + photon-bench.md | 30 ++ photon.md | 13 + port/port_posix.cc | 61 +--- port/port_posix.h | 17 +- table/block_based_filter_block_test.cc | 1 + table/full_filter_block_test.cc | 1 + table/merger_test.cc | 1 + tools/db_bench_tool.cc | 2 +- util/concurrent_arena.cc | 4 +- util/concurrent_arena.h | 4 +- util/mutexlock.h | 6 +- util/repeatable_thread.h | 10 - util/thread_local.cc | 50 +-- util/threadpool_imp.cc | 35 -- util/timer_queue_test.cc | 2 + .../cassandra/cassandra_functional_test.cc | 1 + .../string_append/stringappend_test.cc | 1 + utilities/persistent_cache/hash_table_test.cc | 1 + .../transactions/transaction_db_mutex_impl.cc | 1 + .../write_prepared_transaction_test.cc | 6 + 49 files changed, 679 insertions(+), 237 deletions(-) create mode 100644 examples/CMakeLists.txt create mode 100644 examples/perf/perf-client.cpp create mode 100644 examples/perf/perf-server.cpp create mode 100644 examples/perf/protocol.h create mode 100755 photon-auto-convert.sh create mode 100644 photon-bench.md create mode 100644 photon.md diff --git a/CMakeLists.txt b/CMakeLists.txt index 132d3b04e96..17706d3a7db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,22 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") +include(FetchContent) +set(FETCHCONTENT_QUIET false) +FetchContent_Declare( + photon + GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git + GIT_TAG v0.5.3 +) +set(ENABLE_URING ON CACHE INTERNAL "Enable iouring") +FetchContent_MakeAvailable(photon) +set(PHOTON_INCLUDE_DIR ${photon_SOURCE_DIR}/include/) + +option(INIT_PHOTON_IN_ROCKSDB "INIT PHOTON IN ROCKSDB" OFF) +if(INIT_PHOTON_IN_ROCKSDB) + add_definitions(-DINIT_PHOTON_IN_ROCKSDB) +endif() + option(WITH_JEMALLOC "build with JeMalloc" OFF) option(WITH_SNAPPY "build with SNAPPY" OFF) option(WITH_LZ4 "build with lz4" OFF) @@ -178,7 +194,7 @@ else() if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") include(CheckCXXCompilerFlag) @@ -459,6 +475,7 @@ endif() include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src) +include_directories(${PHOTON_INCLUDE_DIR}) find_package(Threads REQUIRED) add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest) @@ -742,18 +759,18 @@ else() add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES}) target_link_libraries(${ROCKSDB_SHARED_LIB} - ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) + ${THIRDPARTY_LIBS} ${SYSTEM_LIBS} -Wl,--whole-archive $ -Wl,--no-whole-archive) set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES LINKER_LANGUAGE CXX VERSION ${ROCKSDB_VERSION} SOVERSION ${ROCKSDB_VERSION_MAJOR} - CXX_STANDARD 11 + CXX_STANDARD 14 OUTPUT_NAME "rocksdb") endif() add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES}) target_link_libraries(${ROCKSDB_STATIC_LIB} - ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) + ${THIRDPARTY_LIBS} ${SYSTEM_LIBS} $) if(WIN32) add_library(${ROCKSDB_IMPORT_LIB} SHARED ${SOURCES}) @@ -1067,3 +1084,5 @@ option(WITH_TOOLS "build with tools" ON) if(WITH_TOOLS) add_subdirectory(tools) endif() + +add_subdirectory(examples) diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index df51ef2ca2a..d6567071a3f 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -4349,6 +4349,7 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); #if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); diff --git a/db/db_write_test.cc b/db/db_write_test.cc index e6bab875114..69b977ac733 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -61,6 +61,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { leader_count++; while (ready_count < kNumThreads) { // busy waiting + photon::thread_yield(); } } }); diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 0b16c13f573..a0471ba7e80 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -202,6 +202,7 @@ TEST_F(FormatTest, RangeTombstoneSerializeEndKey) { } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/range_del_aggregator_test.cc b/db/range_del_aggregator_test.cc index 28c8129ecb0..82a62dd8d84 100644 --- a/db/range_del_aggregator_test.cc +++ b/db/range_del_aggregator_test.cc @@ -704,6 +704,7 @@ TEST_F(RangeDelAggregatorTest, } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/range_tombstone_fragmenter_test.cc b/db/range_tombstone_fragmenter_test.cc index ddd3f774176..2b76a9de9e9 100644 --- a/db/range_tombstone_fragmenter_test.cc +++ b/db/range_tombstone_fragmenter_test.cc @@ -547,6 +547,7 @@ TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) { } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index cb880560efc..8860938525e 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -185,6 +185,7 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { // This allows us to confidently detect the first writer // who increases threads_linked as the leader. while (threads_linked.load() < cur_threads_joining) { + std::this_thread::yield(); } }); @@ -258,11 +259,13 @@ TEST_F(WriteCallbackTest, WriteWithCallbackTest) { // leaders gotta lead while (i > 0 && threads_verified.load() < 1) { + std::this_thread::yield(); } // loser has to lose while (i == write_group.size() - 1 && threads_verified.load() < write_group.size() - 1) { + std::this_thread::yield(); } auto& write_op = write_group.at(i); diff --git a/env/env_posix.cc b/env/env_posix.cc index 387c0279397..d55ce71284a 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -12,7 +12,6 @@ #if defined(OS_LINUX) #include #endif -#include #include #include #include @@ -120,8 +119,9 @@ class PosixEnv : public Env { PosixEnv(); ~PosixEnv() override { - for (const auto tid : threads_to_join_) { - pthread_join(tid, nullptr); + LOG_INFO("global PosixEnv destruct: Join thread pools"); + for (auto& tid : threads_to_join_) { + tid.join(); } for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].JoinAllThreads(); @@ -760,18 +760,11 @@ class PosixEnv : public Env { return thread_status_updater_->GetThreadList(thread_list); } - static uint64_t gettid(pthread_t tid) { - uint64_t thread_id = 0; - memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); - return thread_id; - } - static uint64_t gettid() { - pthread_t tid = pthread_self(); - return gettid(tid); + return (uint64_t) photon::CURRENT; } - uint64_t GetThreadID() const override { return gettid(pthread_self()); } + uint64_t GetThreadID() const override { return gettid(); } Status GetFreeSpace(const std::string& fname, uint64_t* free_space) override { struct statvfs sbuf; @@ -847,7 +840,7 @@ class PosixEnv : public Env { return 0; } - void SleepForMicroseconds(int micros) override { usleep(micros); } + void SleepForMicroseconds(int micros) override { std::this_thread::sleep_for(std::chrono::microseconds(micros)); } Status GetHostName(char* name, uint64_t len) override { int ret = gethostname(name, static_cast(len)); @@ -1008,8 +1001,8 @@ class PosixEnv : public Env { size_t page_size_; std::vector thread_pools_; - pthread_mutex_t mu_; - std::vector threads_to_join_; + std::mutex mu_; + std::vector threads_to_join_; // If true, allow non owner read access for db files. Otherwise, non-owner // has no access to db files. bool allow_non_owner_access_; @@ -1021,7 +1014,6 @@ PosixEnv::PosixEnv() page_size_(getpagesize()), thread_pools_(Priority::TOTAL), allow_non_owner_access_(true) { - ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( static_cast(pool_id)); @@ -1059,20 +1051,16 @@ static void* StartThreadWrapper(void* arg) { } void PosixEnv::StartThread(void (*function)(void* arg), void* arg) { - pthread_t t; StartThreadState* state = new StartThreadState; state->user_function = function; state->arg = arg; - ThreadPoolImpl::PthreadCall( - "start thread", pthread_create(&t, nullptr, &StartThreadWrapper, state)); - ThreadPoolImpl::PthreadCall("lock", pthread_mutex_lock(&mu_)); - threads_to_join_.push_back(t); - ThreadPoolImpl::PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + std::lock_guard lock(mu_); + threads_to_join_.emplace_back(std::thread(&StartThreadWrapper, state)); } void PosixEnv::WaitForJoin() { - for (const auto tid : threads_to_join_) { - pthread_join(tid, nullptr); + for (auto& tid : threads_to_join_) { + tid.join(); } threads_to_join_.clear(); } @@ -1104,6 +1092,26 @@ std::string Env::GenerateUniqueId() { return uuid2; } +PhotonEnv::PhotonEnv() { + int ret = photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE); + if (ret != 0) { + LOG_FATAL("photon init failed"); + abort(); + } + // Max 8 vcpu. Hardcoded for now. + ret = photon_std::work_pool_init(8, photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE); + if (ret != 0) { + LOG_FATAL("work pool init failed"); + abort(); + } +} + +PhotonEnv::~PhotonEnv() { + photon_std::work_pool_fini(); + photon::fini(); + LOG_INFO("PhotonEnv finished"); +} + // // Default Posix Env // @@ -1118,6 +1126,9 @@ Env* Env::Default() { // of their construction, having this call here guarantees that // the destructor of static PosixEnv will go first, then the // the singletons of ThreadLocalPtr. +#ifdef INIT_PHOTON_IN_ROCKSDB + PhotonEnv::Singleton(); +#endif ThreadLocalPtr::InitSingletons(); CompressionContextCache::InitSingleton(); INIT_SYNC_POINT_SINGLETONS(); diff --git a/env/io_posix.cc b/env/io_posix.cc index 628ed841300..2fdada93358 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -201,11 +201,8 @@ Status PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, size_t left = n; char* ptr = scratch; while (left > 0) { - r = pread(fd_, ptr, left, static_cast(offset)); + r = photon::iouring_pread(fd_, ptr, left, offset, -1); if (r <= 0) { - if (r == -1 && errno == EINTR) { - continue; - } break; } ptr += r; @@ -335,11 +332,8 @@ Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, size_t left = n; char* ptr = scratch; while (left > 0) { - r = pread(fd_, ptr, left, static_cast(offset)); + r = photon::iouring_pread(fd_, ptr, left, offset, -1); if (r <= 0) { - if (r == -1 && errno == EINTR) { - continue; - } break; } ptr += r; @@ -761,6 +755,7 @@ Status PosixWritableFile::Append(const Slice& data) { size_t left = data.size(); while (left != 0) { ssize_t done = write(fd_, src, left); + std::this_thread::yield(); if (done < 0) { if (errno == EINTR) { continue; @@ -784,11 +779,8 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { const char* src = data.data(); size_t left = data.size(); while (left != 0) { - ssize_t done = pwrite(fd_, src, left, static_cast(offset)); + ssize_t done = photon::iouring_pwrite(fd_, src, left, offset, -1);; if (done < 0) { - if (errno == EINTR) { - continue; - } return IOError("While pwrite to file at offset " + ToString(offset), filename_, errno); } @@ -870,14 +862,14 @@ Status PosixWritableFile::Close() { Status PosixWritableFile::Flush() { return Status::OK(); } Status PosixWritableFile::Sync() { - if (fdatasync(fd_) < 0) { + if (photon::iouring_fdatasync(fd_) < 0) { return IOError("While fdatasync", filename_, errno); } return Status::OK(); } Status PosixWritableFile::Fsync() { - if (fsync(fd_) < 0) { + if (photon::iouring_fsync(fd_) < 0) { return IOError("While fsync", filename_, errno); } return Status::OK(); @@ -984,13 +976,9 @@ Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) { const char* src = data.data(); size_t left = data.size(); while (left != 0) { - ssize_t done = pwrite(fd_, src, left, offset); + ssize_t done = photon::iouring_pwrite(fd_, src, left, offset, -1); if (done < 0) { // error while writing to file - if (errno == EINTR) { - // write was interrupted, try again. - continue; - } return IOError( "While write random read/write file at offset " + ToString(offset), filename_, errno); @@ -1010,13 +998,9 @@ Status PosixRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, size_t left = n; char* ptr = scratch; while (left > 0) { - ssize_t done = pread(fd_, ptr, left, offset); + ssize_t done = photon::iouring_pread(fd_, ptr, left, offset, -1); if (done < 0) { // error while reading from file - if (errno == EINTR) { - // read was interrupted, try again. - continue; - } return IOError("While reading random read/write file offset " + ToString(offset) + " len " + ToString(n), filename_, errno); @@ -1038,14 +1022,14 @@ Status PosixRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, Status PosixRandomRWFile::Flush() { return Status::OK(); } Status PosixRandomRWFile::Sync() { - if (fdatasync(fd_) < 0) { + if (photon::iouring_fdatasync(fd_) < 0) { return IOError("While fdatasync random read/write file", filename_, errno); } return Status::OK(); } Status PosixRandomRWFile::Fsync() { - if (fsync(fd_) < 0) { + if (photon::iouring_fsync(fd_) < 0) { return IOError("While fsync random read/write file", filename_, errno); } return Status::OK(); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 00000000000..9248c920d9f --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,7 @@ +add_executable(perf-client perf/perf-client.cpp) +target_include_directories(perf-client PRIVATE ${PROJECT_SOURCE_DIR}/include ${PHOTON_INCLUDE_DIR}) +target_link_libraries(perf-client ${ROCKSDB_STATIC_LIB}) + +add_executable(perf-server perf/perf-server.cpp) +target_include_directories(perf-server PRIVATE ${PROJECT_SOURCE_DIR}/include ${PHOTON_INCLUDE_DIR}) +target_link_libraries(perf-server ${ROCKSDB_STATIC_LIB}) \ No newline at end of file diff --git a/examples/perf/perf-client.cpp b/examples/perf/perf-client.cpp new file mode 100644 index 00000000000..e84c5ede914 --- /dev/null +++ b/examples/perf/perf-client.cpp @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "protocol.h" + +DEFINE_int32(port, 9527, "server port"); +DEFINE_string(host, "127.0.0.1", "server ip"); +DEFINE_string(type, "fill", "fill/read/write"); + +constexpr int CONCURRENCY = 32; +constexpr int MAX_KEY_NUM = 10'000; + +int gen_random_key() { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dist(0, MAX_KEY_NUM - 1); + return dist(gen); +} + +void run_perf(photon::net::EndPoint ep, photon::rpc::StubPool* pool) { + int ret; + auto stub = pool->get_stub(ep, false); + DEFER(pool->put_stub(ep, ret < 0)); + + while (true) { + Echo::Request req; + std::string key = std::to_string(gen_random_key()); + req.key.assign(key); + req.write = FLAGS_type == "read" ? false : true; + + Echo::Response resp; + ret = stub->call(req, resp); + if (ret < 0 || resp.ret != 0) abort(); + } +} + +void run_fill(photon::net::EndPoint ep, photon::rpc::StubPool* pool) { + int ret; + auto stub = pool->get_stub(ep, false); + DEFER(pool->put_stub(ep, ret < 0)); + + for (int i = 0; i < MAX_KEY_NUM; ++i) { + Echo::Request req; + std::string key = std::to_string(i); + req.key.assign(key); + req.write = FLAGS_type == "read" ? false : true; + + Echo::Response resp; + ret = stub->call(req, resp); + if (ret < 0 || resp.ret != 0) abort(); + } +} + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + if (photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE)) + LOG_ERROR_RETURN(0, -1, "fail to init photon"); + DEFER(photon::fini()); + + auto ep = photon::net::EndPoint(photon::net::IPAddr(FLAGS_host.c_str()), + FLAGS_port); + + auto pool = photon::rpc::new_stub_pool(-1, -1, -1); + + if (FLAGS_type == "fill") { + run_fill(ep, pool); + } else { + for (int i = 0; i < CONCURRENCY; ++i) { + photon::thread_create11(run_perf, ep, pool); + } + photon::thread_sleep(-1); + } +} \ No newline at end of file diff --git a/examples/perf/perf-server.cpp b/examples/perf/perf-server.cpp new file mode 100644 index 00000000000..30778baaf30 --- /dev/null +++ b/examples/perf/perf-server.cpp @@ -0,0 +1,308 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "protocol.h" + +DEFINE_int32(port, 9527, "Server listen port"); +DEFINE_int32(show_qps_interval, 10, "interval seconds to show qps"); +DEFINE_int32(vcpu_num, 8, "vcpu number"); +DEFINE_bool(create_new_db, false, "create new db"); + +static std::atomic qps{0}; + +static void show_qps_loop() { + while (true) { + photon::thread_sleep(FLAGS_show_qps_interval); + LOG_INFO("QPS: `", qps.load() / FLAGS_show_qps_interval); + qps = 0; + } +} + +class ExampleServer { + public: + // 协程池对性能影响巨大,如果这里将thread_pool_size降为0,即关闭协程池,则性能变为原先1/3 ~ 1/2 + explicit ExampleServer(int db_num = 1, int thread_pool_size = 65536) + : skeleton(photon::rpc::new_skeleton(true, thread_pool_size)), + server(photon::net::new_tcp_socket_server()), + m_db_num(db_num) { + skeleton->register_service(this); + writeOptions.sync = true; + db_sharding.resize(db_num); + LOG_INFO(VALUE(m_db_num)); + } + + virtual int do_rpc_service(Echo::Request* req, Echo::Response* resp, + IOVector*, IStream*) { + photon_std::this_thread::migrate(); + rocksdb::Status s; + std::string val; + rocksdb::DB* db = db_sharding[std::stoi(req->key.to_std()) % m_db_num]; + if (req->write) { + s = db->Put(writeOptions, + rocksdb::Slice(req->key.c_str(), req->key.size()), "1"); + } else { + s = db->Get(readOptions, + rocksdb::Slice(req->key.c_str(), req->key.size()), &val); + if (val != "1") { + LOG_ERROR("read value error"); + abort(); + } + } + if (!s.ok()) { + LOG_ERROR("db error"); + abort(); + } + resp->ret = 0; + qps++; + return 0; + } + + int serve(photon::net::ISocketStream* stream) { + return skeleton->serve(stream, false); + } + + int run(int port) { + // Optimize RocksDB. This is the easiest way to get RocksDB to perform well + options.IncreaseParallelism(); + options.OptimizeLevelStyleCompaction(); + + options.stats_dump_period_sec = 0; + options.stats_persist_period_sec = 0; + options.enable_pipelined_write = true; + options.compression = rocksdb::CompressionType::kLZ4Compression; + // create the DB if it's not already present + options.create_if_missing = true; + + if (open_db()) return -1; + + server->set_handler({this, &ExampleServer::serve}); + server->setsockopt(SOL_SOCKET, SO_REUSEPORT, 1); + if (server->bind(port) < 0) + LOG_ERRNO_RETURN(0, -1, "Failed to bind port `", port) + if (server->listen() < 0) LOG_ERRNO_RETURN(0, -1, "Failed to listen"); + LOG_INFO("Started rpc server at `", server->getsockname()); + return server->start_loop(true); + } + + protected: + static constexpr const char* db_dir = "perf-db"; + + std::unique_ptr skeleton{}; + std::unique_ptr server{}; + std::vector db_sharding{}; // 在一个server里open多个db + rocksdb::Options options; + rocksdb::WriteOptions writeOptions; + rocksdb::ReadOptions readOptions; + int m_db_num; + + virtual int open_db() { + for (int i = 0; i < m_db_num; ++i) { + if (open_db_at_index(i)) + abort(); + } + return 0; + } + + virtual int open_db_at_index(int index) { + std::string path = std::string(get_current_dir_name()) + "/" + + std::string(db_dir) + "-" + std::to_string(index); + if (FLAGS_create_new_db) { + system((std::string("rm -rf ") + path).c_str()); + LOG_INFO("Create new db at `", path.c_str()); + } else { + LOG_INFO("Open db at `", path.c_str()); + } + rocksdb::Status s = rocksdb::DB::Open(options, path, &db_sharding[index]); + if (!s.ok()) { + LOG_ERROR_RETURN(0, -1, "open db ` failed:`", index, s.ToString().c_str()); + } + return 0; + } +}; + +class ExampleServerWithNativeRocksdb : public ExampleServer { + public: + // 同步线程模式下,线程数量需要设置大一点。可以用taskset限制程序的cpu数量等于协程的vcpu数 + explicit ExampleServerWithNativeRocksdb() + : pool(new photon::WorkPool(256, photon::INIT_EVENT_IOURING, 0)), + ExampleServer() { + + } + int do_rpc_service(Echo::Request* req, Echo::Response* resp, IOVector*, + IStream*) override { + // 使用work pool进行同步线程调用 + pool->call([&] { + rocksdb::Status s; + std::string val; + rocksdb::DB* db = db_sharding[std::stoi(req->key.to_std()) % m_db_num]; + if (req->write) { + s = db->Put(writeOptions, + rocksdb::Slice(req->key.c_str(), req->key.size()), "1"); + } else { + s = db->Get(readOptions, + rocksdb::Slice(req->key.c_str(), req->key.size()), &val); + if (val != "1") abort(); + } + }); + resp->ret = 0; + qps++; + return 0; + } + + private: + photon::WorkPool* pool; +}; + +class MultiExampleServer : public ExampleServer { + public: + explicit MultiExampleServer(int index) + : m_index(index), ExampleServer() { + } + + int do_rpc_service(Echo::Request* req, Echo::Response* resp, IOVector*, + IStream*) override { + rocksdb::Status s; + std::string val; + if (req->write) { + s = db_alone->Put(writeOptions, + rocksdb::Slice(req->key.c_str(), req->key.size()), "1"); + } else { + s = db_alone->Get(readOptions, + rocksdb::Slice(req->key.c_str(), req->key.size()), &val); + if (val != "1") abort(); + } + resp->ret = 0; + qps++; + return 0; + } + + private: + int m_index; + rocksdb::DB* db_alone = nullptr; // 每个server配一个db + + int open_db() override { + std::string path = std::string(get_current_dir_name()) + "/" + + std::string(db_dir) + "-" + std::to_string(m_index); + system((std::string("rm -rf ") + path).c_str()); + LOG_INFO("Create new db at `", path.c_str()); + rocksdb::Status s = rocksdb::DB::Open(options, path, &db_alone); + if (!s.ok()) { + LOG_ERROR_RETURN(0, -1, "open db failed"); + } + return 0; + } +}; + +class MultiDBExampleServer : public ExampleServer { + public: + explicit MultiDBExampleServer(int db_num) : ExampleServer(db_num) {} + + int do_rpc_service(Echo::Request* req, Echo::Response* resp, IOVector*, + IStream*) override { + rocksdb::Status s; + std::string val; + size_t index = std::stoi(req->key.to_std()) % m_db_num; + + // TODO: modify photon + // photon_std::this_thread::migrate(index); + + rocksdb::DB* db = db_sharding[index]; + if (req->write) { + s = db->Put(writeOptions, + rocksdb::Slice(req->key.c_str(), req->key.size()), "1"); + } else { + s = db->Get(readOptions, + rocksdb::Slice(req->key.c_str(), req->key.size()), &val); + if (val != "1") { + LOG_ERROR("read value error"); + abort(); + } + } + if (!s.ok()) { + LOG_ERROR("db error"); + abort(); + } + resp->ret = 0; + qps++; + return 0; + } + + private: + int open_db() override { + for (int i = 0; i < m_db_num; ++i) { + photon::thread_create11(&MultiDBExampleServer::open_db_at_index, this, i); + } + return 0; + } + + int open_db_at_index(int index) override { + // TODO modify photon + // photon_std::this_thread::migrate(index); + LOG_INFO("Open db ` in vcpu `", index, photon::get_vcpu()); + return ExampleServer::open_db_at_index(index); + } +}; + +// 单server,用thread_migrate迁移到多vcpu +void test_single_server() { + photon_std::work_pool_init(FLAGS_vcpu_num, photon::INIT_EVENT_IOURING, 0); + auto server = new ExampleServer(); + server->run(FLAGS_port); +} + +// 单server,原生多线程版本db +void test_single_server_with_native_rocksdb() { + auto server = new ExampleServerWithNativeRocksdb(); + server->run(FLAGS_port); +} + +// 多server监听同一端口,让内核来分发连接,每个vcpu有一个server,每个server一个db实例 +// 需要修改std-compat.h,让rocksdb内部的thread不会自动迁移 +void test_multiple_servers() { + for (int i = 0; i < FLAGS_vcpu_num; ++i) { + new std::thread([i] { + photon::init(photon::INIT_EVENT_IOURING, 0); + auto server = new MultiExampleServer(i); + server->run(FLAGS_port); + photon::thread_sleep(-1); + }); + } + photon::thread_sleep(-1); +} + +// 一个server,open多个db。每个db只处理自己vcpu上的读请求,不跨vcpu +void test_multi_db_server() { + photon_std::work_pool_init(FLAGS_vcpu_num, photon::INIT_EVENT_IOURING, 0); + auto server = new MultiDBExampleServer(FLAGS_vcpu_num); + server->run(FLAGS_port); +} + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + if (photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE)) + LOG_ERROR_RETURN(0, -1, "fail to init photon"); + DEFER(photon::fini()); + + photon::thread_create11(show_qps_loop); + + test_single_server(); + // test_single_server_with_native_rocksdb(); + // test_multiple_servers(); + // test_multi_db_server(); +} diff --git a/examples/perf/protocol.h b/examples/perf/protocol.h new file mode 100644 index 00000000000..1cdd964873d --- /dev/null +++ b/examples/perf/protocol.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +#include + +struct Echo { + const static uint32_t IID = 1; + const static uint32_t FID = 2; + + struct Request : public photon::rpc::Message { + photon::rpc::string key; + bool write; + + PROCESS_FIELDS(key, write); + }; + + struct Response : public photon::rpc::Message { + int32_t ret; + + PROCESS_FIELDS(ret); + }; +}; \ No newline at end of file diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 4d3a96fe288..e79dcecbd9c 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -1454,4 +1454,21 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); // This is a factory method for TimedEnv defined in utilities/env_timed.cc. Env* NewTimedEnv(Env* base_env); +class PhotonEnv { + public: + static PhotonEnv& Singleton() { + static PhotonEnv instance; + return instance; + } + + PhotonEnv(PhotonEnv const&) = delete; + PhotonEnv(PhotonEnv&&) = delete; + PhotonEnv& operator=(PhotonEnv const&) = delete; + PhotonEnv& operator=(PhotonEnv&&) = delete; + + private: + PhotonEnv(); + ~PhotonEnv(); +}; + } // namespace rocksdb diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index b416ef7c557..651a97b82c9 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -640,6 +640,7 @@ TEST_F(InlineSkipTest, ConcurrentInsert3) { RunConcurrentInsert(3); } } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 50c3588bb86..1cfdcb411ef 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -383,6 +383,7 @@ TEST_F(SkipTest, Concurrent5) { RunConcurrent(5); } } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/monitoring/iostats_context.cc b/monitoring/iostats_context.cc index 3d102f91203..5534853d64f 100644 --- a/monitoring/iostats_context.cc +++ b/monitoring/iostats_context.cc @@ -10,12 +10,12 @@ namespace rocksdb { #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL -__thread IOStatsContext iostats_context; +photon::thread_local_ptr iostats_context; #endif IOStatsContext* get_iostats_context() { #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL - return &iostats_context; + return iostats_context.operator->(); #else return nullptr; #endif diff --git a/monitoring/iostats_context_imp.h b/monitoring/iostats_context_imp.h index 23c2088cab2..80a84d98502 100644 --- a/monitoring/iostats_context_imp.h +++ b/monitoring/iostats_context_imp.h @@ -9,38 +9,38 @@ #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL namespace rocksdb { -extern __thread IOStatsContext iostats_context; +extern photon::thread_local_ptr iostats_context; } // namespace rocksdb // increment a specific counter by the specified value -#define IOSTATS_ADD(metric, value) (iostats_context.metric += value) +#define IOSTATS_ADD(metric, value) (iostats_context->metric += value) // Increase metric value only when it is positive #define IOSTATS_ADD_IF_POSITIVE(metric, value) \ if (value > 0) { IOSTATS_ADD(metric, value); } // reset a specific counter to zero -#define IOSTATS_RESET(metric) (iostats_context.metric = 0) +#define IOSTATS_RESET(metric) (iostats_context->metric = 0) // reset all counters to zero -#define IOSTATS_RESET_ALL() (iostats_context.Reset()) +#define IOSTATS_RESET_ALL() (iostats_context->Reset()) #define IOSTATS_SET_THREAD_POOL_ID(value) \ - (iostats_context.thread_pool_id = value) + (iostats_context->thread_pool_id = value) -#define IOSTATS_THREAD_POOL_ID() (iostats_context.thread_pool_id) +#define IOSTATS_THREAD_POOL_ID() (iostats_context->thread_pool_id) -#define IOSTATS(metric) (iostats_context.metric) +#define IOSTATS(metric) (iostats_context->metric) // Declare and set start time of the timer #define IOSTATS_TIMER_GUARD(metric) \ - PerfStepTimer iostats_step_timer_##metric(&(iostats_context.metric)); \ + PerfStepTimer iostats_step_timer_##metric(&(iostats_context->metric)); \ iostats_step_timer_##metric.Start(); // Declare and set start time of the timer #define IOSTATS_CPU_TIMER_GUARD(metric, env) \ PerfStepTimer iostats_step_timer_##metric( \ - &(iostats_context.metric), env, true, \ + &(iostats_context->metric), env, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ iostats_step_timer_##metric.Start(); diff --git a/monitoring/iostats_context_test.cc b/monitoring/iostats_context_test.cc index 74d3e43291d..daf03ea083c 100644 --- a/monitoring/iostats_context_test.cc +++ b/monitoring/iostats_context_test.cc @@ -24,6 +24,7 @@ TEST(IOStatsContextTest, ToString) { } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/monitoring/perf_context.cc b/monitoring/perf_context.cc index 40b0b215c47..bdc1fb06c06 100644 --- a/monitoring/perf_context.cc +++ b/monitoring/perf_context.cc @@ -15,7 +15,7 @@ PerfContext perf_context; #if defined(OS_SOLARIS) __thread PerfContext perf_context_; #else -thread_local PerfContext perf_context; +photon::thread_local_ptr perf_context; #endif #endif @@ -26,7 +26,7 @@ PerfContext* get_perf_context() { #if defined(OS_SOLARIS) return &perf_context_; #else - return &perf_context; + return perf_context.operator->(); #endif #endif } diff --git a/monitoring/perf_context_imp.h b/monitoring/perf_context_imp.h index e0ff8afc58e..a0d4b89c948 100644 --- a/monitoring/perf_context_imp.h +++ b/monitoring/perf_context_imp.h @@ -16,7 +16,7 @@ extern PerfContext perf_context; extern __thread PerfContext perf_context_; #define perf_context (*get_perf_context()) #else -extern thread_local PerfContext perf_context; +extern photon::thread_local_ptr perf_context; #endif #endif @@ -38,24 +38,24 @@ extern thread_local PerfContext perf_context; // Declare and set start time of the timer #define PERF_TIMER_GUARD(metric) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric)); \ + PerfStepTimer perf_step_timer_##metric(&(perf_context->metric)); \ perf_step_timer_##metric.Start(); // Declare and set start time of the timer #define PERF_TIMER_GUARD_WITH_ENV(metric, env) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), env); \ + PerfStepTimer perf_step_timer_##metric(&(perf_context->metric), env); \ perf_step_timer_##metric.Start(); // Declare and set start time of the timer #define PERF_CPU_TIMER_GUARD(metric, env) \ PerfStepTimer perf_step_timer_##metric( \ - &(perf_context.metric), env, true, \ + &(perf_context->metric), env, true, \ PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); \ perf_step_timer_##metric.Start(); #define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition, stats, \ ticker_type) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), nullptr, \ + PerfStepTimer perf_step_timer_##metric(&(perf_context->metric), nullptr, \ false, PerfLevel::kEnableTime, stats, \ ticker_type); \ if (condition) { \ @@ -68,23 +68,23 @@ extern thread_local PerfContext perf_context; // Increase metric value #define PERF_COUNTER_ADD(metric, value) \ - if (perf_level >= PerfLevel::kEnableCount) { \ - perf_context.metric += value; \ + if (*perf_level >= PerfLevel::kEnableCount) { \ + perf_context->metric += value; \ } // Increase metric value #define PERF_COUNTER_BY_LEVEL_ADD(metric, value, level) \ - if (perf_level >= PerfLevel::kEnableCount && \ - perf_context.per_level_perf_context_enabled && \ - perf_context.level_to_perf_context) { \ - if ((*(perf_context.level_to_perf_context)).find(level) != \ - (*(perf_context.level_to_perf_context)).end()) { \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ + if (*perf_level >= PerfLevel::kEnableCount && \ + perf_context->per_level_perf_context_enabled && \ + perf_context->level_to_perf_context) { \ + if ((*(perf_context->level_to_perf_context)).find(level) != \ + (*(perf_context->level_to_perf_context)).end()) { \ + (*(perf_context->level_to_perf_context))[level].metric += value; \ } \ else { \ PerfContextByLevel empty_context; \ - (*(perf_context.level_to_perf_context))[level] = empty_context; \ - (*(perf_context.level_to_perf_context))[level].metric += value; \ + (*(perf_context->level_to_perf_context))[level] = empty_context; \ + (*(perf_context->level_to_perf_context))[level].metric += value; \ } \ } \ diff --git a/monitoring/perf_level.cc b/monitoring/perf_level.cc index 79c718cce76..527f6d43490 100644 --- a/monitoring/perf_level.cc +++ b/monitoring/perf_level.cc @@ -10,7 +10,7 @@ namespace rocksdb { #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL -__thread PerfLevel perf_level = kEnableCount; +photon::thread_local_ptr perf_level(kEnableCount); #else PerfLevel perf_level = kEnableCount; #endif @@ -18,11 +18,11 @@ PerfLevel perf_level = kEnableCount; void SetPerfLevel(PerfLevel level) { assert(level > kUninitialized); assert(level < kOutOfBounds); - perf_level = level; + *perf_level = level; } PerfLevel GetPerfLevel() { - return perf_level; + return *perf_level; } } // namespace rocksdb diff --git a/monitoring/perf_level_imp.h b/monitoring/perf_level_imp.h index 2a3add19cee..6e56c79e1cf 100644 --- a/monitoring/perf_level_imp.h +++ b/monitoring/perf_level_imp.h @@ -10,7 +10,7 @@ namespace rocksdb { #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL -extern __thread PerfLevel perf_level; +extern photon::thread_local_ptr perf_level; #else extern PerfLevel perf_level; #endif diff --git a/monitoring/perf_step_timer.h b/monitoring/perf_step_timer.h index 6501bd54aba..35a416e30cc 100644 --- a/monitoring/perf_step_timer.h +++ b/monitoring/perf_step_timer.h @@ -16,7 +16,7 @@ class PerfStepTimer { uint64_t* metric, Env* env = nullptr, bool use_cpu_time = false, PerfLevel enable_level = PerfLevel::kEnableTimeExceptForMutex, Statistics* statistics = nullptr, uint32_t ticker_type = 0) - : perf_counter_enabled_(perf_level >= enable_level), + : perf_counter_enabled_((*perf_level) >= enable_level), use_cpu_time_(use_cpu_time), env_((perf_counter_enabled_ || statistics != nullptr) ? ((env != nullptr) ? env : Env::Default()) diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc index cde44928b62..408d509a6d3 100644 --- a/monitoring/thread_status_updater.cc +++ b/monitoring/thread_status_updater.cc @@ -13,7 +13,8 @@ namespace rocksdb { #ifdef ROCKSDB_USING_THREAD_STATUS -__thread ThreadStatusData* ThreadStatusUpdater::thread_status_data_ = nullptr; +photon::thread_local_ptr ThreadStatusUpdater::thread_status_data_ptr_(nullptr); +#define thread_status_data_ (*thread_status_data_ptr_) void ThreadStatusUpdater::RegisterThread(ThreadStatus::ThreadType ttype, uint64_t thread_id) { diff --git a/monitoring/thread_status_updater.h b/monitoring/thread_status_updater.h index 6706d159dfb..a9fec89521c 100644 --- a/monitoring/thread_status_updater.h +++ b/monitoring/thread_status_updater.h @@ -196,7 +196,7 @@ class ThreadStatusUpdater { protected: #ifdef ROCKSDB_USING_THREAD_STATUS // The thread-local variable for storing thread status. - static __thread ThreadStatusData* thread_status_data_; + static photon::thread_local_ptr thread_status_data_ptr_; // Returns the pointer to the thread status data only when the // thread status data is non-null and has enable_tracking == true. @@ -205,7 +205,7 @@ class ThreadStatusUpdater { // Directly returns the pointer to thread_status_data_ without // checking whether enabling_tracking is true of not. ThreadStatusData* Get() { - return thread_status_data_; + return *thread_status_data_ptr_; } // The mutex that protects cf_info_map and db_key_map. diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc index c2af0a57454..9782b15048a 100644 --- a/monitoring/thread_status_util.cc +++ b/monitoring/thread_status_util.cc @@ -11,9 +11,10 @@ namespace rocksdb { #ifdef ROCKSDB_USING_THREAD_STATUS -__thread ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = - nullptr; -__thread bool ThreadStatusUtil::thread_updater_initialized_ = false; +photon::thread_local_ptr ThreadStatusUtil::thread_updater_local_cache_ptr_(nullptr); +photon::thread_local_ptr ThreadStatusUtil::thread_updater_initialized_ptr_(false); +#define thread_updater_initialized_ (*thread_updater_initialized_ptr_) +#define thread_updater_local_cache_ (*thread_updater_local_cache_ptr_) void ThreadStatusUtil::RegisterThread(const Env* env, ThreadStatus::ThreadType thread_type) { diff --git a/monitoring/thread_status_util.h b/monitoring/thread_status_util.h index a403435c3d0..c455b0a1fb9 100644 --- a/monitoring/thread_status_util.h +++ b/monitoring/thread_status_util.h @@ -94,7 +94,7 @@ class ThreadStatusUtil { // When this variable is set to true, thread_updater_local_cache_ // will not be updated until this variable is again set to false // in UnregisterThread(). - static __thread bool thread_updater_initialized_; + static photon::thread_local_ptr thread_updater_initialized_ptr_; // The thread-local cached ThreadStatusUpdater that caches the // thread_status_updater_ of the first Env that uses any ThreadStatusUtil @@ -109,7 +109,8 @@ class ThreadStatusUtil { // When thread_updater_initialized_ is set to true, this variable // will not be updated until this thread_updater_initialized_ is // again set to false in UnregisterThread(). - static __thread ThreadStatusUpdater* thread_updater_local_cache_; + static photon::thread_local_ptr thread_updater_local_cache_ptr_; + #else static bool thread_updater_initialized_; static ThreadStatusUpdater* thread_updater_local_cache_; diff --git a/photon-auto-convert.sh b/photon-auto-convert.sh new file mode 100755 index 00000000000..49a9e525db4 --- /dev/null +++ b/photon-auto-convert.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e + +cc_files=$(find . -type f -name "*.cc" -not -path "./build/*") +h_files=$(find . -type f -name "*.h" -not -path "./build/*") +files="${cc_files} ${h_files}" + +sed -i 's|#include |#include "port/port.h"|g' $files +sed -i 's|#include |#include "port/port.h"|g' $files +sed -i 's|#include |#include "port/port.h"|g' $files +sed -i 's/std::mutex/photon_std::mutex/g' $files +sed -i 's/std::condition_variable/photon_std::condition_variable/g' $files +sed -i 's/std::lock_guard/photon_std::lock_guard/g' $files +sed -i 's/std::unique_lock/photon_std::unique_lock/g' $files +sed -i 's/std::thread/photon_std::thread/g' $files +sed -i 's/std::this_thread/photon_std::this_thread/g' $files diff --git a/photon-bench.md b/photon-bench.md new file mode 100644 index 00000000000..15f80df5a34 --- /dev/null +++ b/photon-bench.md @@ -0,0 +1,30 @@ +## Build + +```bash +# Auto convert code +./photon-auto-convert.sh + +# Compile +cmake -B build -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D CMAKE_BUILD_TYPE=Release +cmake --build build -t db_bench -j + +# Run benchmark +cd build +cp ../tools/benchmark.sh . +export DB_DIR=`pwd`/test-db +export WAL_DIR=$DB_DIR +export OUTPUT_DIR=$DB_DIR +export COMPRESSION_TYPE=lz4 +export NUM_KEYS=100000000 # Require 14 GB disk space + +# Edit benchmark.sh, add `taskset -c 1,8` before the ./db_bench command. +# This would limit the CPU number for both thread and coroutine, in order to make a fair comparison. + +# Clean page cache before every test +echo 3 > /proc/sys/vm/drop_caches + +./benchmark.sh bulkload # Generate data +./benchmark.sh readrandom # Read test +./benchmark.sh overwrite # Overwrite test (sync = 0) +./benchmark.sh updaterandom # Update test (read first, then write, sync = 1) +``` diff --git a/photon.md b/photon.md new file mode 100644 index 00000000000..7b5857b4d86 --- /dev/null +++ b/photon.md @@ -0,0 +1,13 @@ +## Build + +```bash +# Build performance test +./photon-auto-convert.sh +cmake -B build -D INIT_PHOTON_IN_ROCKSDB=off -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release +cmake --build build -t perf-client -t perf-server -j + +# Build CI tests and db_bench +./photon-auto-convert.sh +cmake -B build -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release +cmake --build build -t db_bench -j +``` diff --git a/port/port_posix.cc b/port/port_posix.cc index 80081e480e0..1cbc41e0935 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -51,29 +51,12 @@ static int PthreadCall(const char* label, int result) { } Mutex::Mutex(bool adaptive) { - (void) adaptive; -#ifdef ROCKSDB_PTHREAD_ADAPTIVE_MUTEX - if (!adaptive) { - PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); - } else { - pthread_mutexattr_t mutex_attr; - PthreadCall("init mutex attr", pthread_mutexattr_init(&mutex_attr)); - PthreadCall("set mutex attr", - pthread_mutexattr_settype(&mutex_attr, - PTHREAD_MUTEX_ADAPTIVE_NP)); - PthreadCall("init mutex", pthread_mutex_init(&mu_, &mutex_attr)); - PthreadCall("destroy mutex attr", - pthread_mutexattr_destroy(&mutex_attr)); - } -#else - PthreadCall("init mutex", pthread_mutex_init(&mu_, nullptr)); -#endif // ROCKSDB_PTHREAD_ADAPTIVE_MUTEX } -Mutex::~Mutex() { PthreadCall("destroy mutex", pthread_mutex_destroy(&mu_)); } +Mutex::~Mutex() { } void Mutex::Lock() { - PthreadCall("lock", pthread_mutex_lock(&mu_)); + mu_.lock(); #ifndef NDEBUG locked_ = true; #endif @@ -83,7 +66,7 @@ void Mutex::Unlock() { #ifndef NDEBUG locked_ = false; #endif - PthreadCall("unlock", pthread_mutex_unlock(&mu_)); + mu_.unlock(); } void Mutex::AssertHeld() { @@ -94,63 +77,57 @@ void Mutex::AssertHeld() { CondVar::CondVar(Mutex* mu) : mu_(mu) { - PthreadCall("init cv", pthread_cond_init(&cv_, nullptr)); } -CondVar::~CondVar() { PthreadCall("destroy cv", pthread_cond_destroy(&cv_)); } +CondVar::~CondVar() {} void CondVar::Wait() { #ifndef NDEBUG mu_->locked_ = false; #endif - PthreadCall("wait", pthread_cond_wait(&cv_, &mu_->mu_)); + cv_.wait(mu_->mu_); #ifndef NDEBUG mu_->locked_ = true; #endif } bool CondVar::TimedWait(uint64_t abs_time_us) { - struct timespec ts; - ts.tv_sec = static_cast(abs_time_us / 1000000); - ts.tv_nsec = static_cast((abs_time_us % 1000000) * 1000); - #ifndef NDEBUG mu_->locked_ = false; #endif - int err = pthread_cond_timedwait(&cv_, &mu_->mu_, &ts); + auto abs_now_us = std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + uint64_t timeout = abs_time_us > abs_now_us ? abs_time_us - abs_now_us : 0; + int ret = cv_.wait(mu_->mu_, timeout); #ifndef NDEBUG mu_->locked_ = true; #endif - if (err == ETIMEDOUT) { + if (ret != 0 || timeout == 0) { return true; } - if (err != 0) { - PthreadCall("timedwait", err); - } return false; } void CondVar::Signal() { - PthreadCall("signal", pthread_cond_signal(&cv_)); + cv_.notify_one(); } void CondVar::SignalAll() { - PthreadCall("broadcast", pthread_cond_broadcast(&cv_)); + cv_.notify_all(); } RWMutex::RWMutex() { - PthreadCall("init mutex", pthread_rwlock_init(&mu_, nullptr)); } -RWMutex::~RWMutex() { PthreadCall("destroy mutex", pthread_rwlock_destroy(&mu_)); } +RWMutex::~RWMutex() { } -void RWMutex::ReadLock() { PthreadCall("read lock", pthread_rwlock_rdlock(&mu_)); } +void RWMutex::ReadLock() { mu_.lock(photon::RLOCK); } -void RWMutex::WriteLock() { PthreadCall("write lock", pthread_rwlock_wrlock(&mu_)); } +void RWMutex::WriteLock() { mu_.lock(photon::WLOCK); } -void RWMutex::ReadUnlock() { PthreadCall("read unlock", pthread_rwlock_unlock(&mu_)); } +void RWMutex::ReadUnlock() { mu_.unlock(); } -void RWMutex::WriteUnlock() { PthreadCall("write unlock", pthread_rwlock_unlock(&mu_)); } +void RWMutex::WriteUnlock() { mu_.unlock(); } int PhysicalCoreID() { #if defined(ROCKSDB_SCHED_GETCPU_PRESENT) && defined(__x86_64__) && \ @@ -175,10 +152,6 @@ int PhysicalCoreID() { #endif } -void InitOnce(OnceType* once, void (*initializer)()) { - PthreadCall("once", pthread_once(once, initializer)); -} - void Crash(const std::string& srcfile, int srcline) { fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); fflush(stdout); diff --git a/port/port_posix.h b/port/port_posix.h index 63d7239fe6d..9bbf2053f19 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -11,7 +11,11 @@ #pragma once -#include +#include +#include +#include +#include +#include // size_t printf formatting named in the manner of C99 standard formatting // strings such as PRIu64 // in fact, we could use that one @@ -49,7 +53,6 @@ #else #include #endif -#include #include #include @@ -114,7 +117,7 @@ class Mutex { private: friend class CondVar; - pthread_mutex_t mu_; + photon::mutex mu_; #ifndef NDEBUG bool locked_; #endif @@ -136,7 +139,7 @@ class RWMutex { void AssertHeld() { } private: - pthread_rwlock_t mu_; // the underlying platform mutex + photon::rwlock mu_; // the underlying platform mutex // No copying allowed RWMutex(const RWMutex&); @@ -153,7 +156,7 @@ class CondVar { void Signal(); void SignalAll(); private: - pthread_cond_t cv_; + photon::condition_variable cv_; Mutex* mu_; }; @@ -173,10 +176,6 @@ static inline void AsmVolatilePause() { // Returns -1 if not available on this platform extern int PhysicalCoreID(); -typedef pthread_once_t OnceType; -#define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT -extern void InitOnce(OnceType* once, void (*initializer)()); - #ifndef CACHE_LINE_SIZE #if defined(__s390__) #define CACHE_LINE_SIZE 256U diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc index 6b352b2f6b0..ae06d7e727a 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based_filter_block_test.cc @@ -243,6 +243,7 @@ TEST_F(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc index f01ae52bf7d..e74b5931cd8 100644 --- a/table/full_filter_block_test.cc +++ b/table/full_filter_block_test.cc @@ -217,6 +217,7 @@ TEST_F(FullFilterBlockTest, SingleChunk) { } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/table/merger_test.cc b/table/merger_test.cc index 1b04d065727..da7b6ca3776 100644 --- a/table/merger_test.cc +++ b/table/merger_test.cc @@ -175,6 +175,7 @@ TEST_F(MergerTest, SeekToLastTest) { } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 0cb4e0eb27e..ddccadf2904 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -2947,7 +2947,7 @@ void VerifyDBFromDB(std::string& truth_db_name) { } SetPerfLevel(static_cast (shared->perf_level)); - perf_context.EnablePerLevelPerfContext(); + perf_context->EnablePerLevelPerfContext(); thread->stats.Start(thread->tid); (arg->bm->*(arg->method))(thread); thread->stats.Stop(); diff --git a/util/concurrent_arena.cc b/util/concurrent_arena.cc index cef77d7e75f..3b53e9fc9e3 100644 --- a/util/concurrent_arena.cc +++ b/util/concurrent_arena.cc @@ -15,7 +15,7 @@ namespace rocksdb { #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL -__thread size_t ConcurrentArena::tls_cpuid = 0; +photon::thread_local_ptr ConcurrentArena::tls_cpuid(0); #endif namespace { @@ -39,7 +39,7 @@ ConcurrentArena::Shard* ConcurrentArena::Repick() { #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL // even if we are cpu 0, use a non-zero tls_cpuid so we can tell we // have repicked - tls_cpuid = shard_and_index.second | shards_.Size(); + *tls_cpuid = shard_and_index.second | shards_.Size(); #endif return shard_and_index.first; } diff --git a/util/concurrent_arena.h b/util/concurrent_arena.h index a6191100fd0..6605f67f839 100644 --- a/util/concurrent_arena.h +++ b/util/concurrent_arena.h @@ -95,7 +95,7 @@ class ConcurrentArena : public Allocator { }; #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL - static __thread size_t tls_cpuid; + static photon::thread_local_ptr tls_cpuid; #else enum ZeroFirstEnum : size_t { tls_cpuid = 0 }; #endif @@ -135,7 +135,7 @@ class ConcurrentArena : public Allocator { // concurrency zero unless it might actually confer an advantage. std::unique_lock arena_lock(arena_mutex_, std::defer_lock); if (bytes > shard_block_size_ / 4 || force_arena || - ((cpu = tls_cpuid) == 0 && + ((cpu = *tls_cpuid) == 0 && !shards_.AccessAtCore(0)->allocated_and_unused_.load( std::memory_order_relaxed) && arena_lock.try_lock())) { diff --git a/util/mutexlock.h b/util/mutexlock.h index 640cef3daf7..c4d4d693832 100644 --- a/util/mutexlock.h +++ b/util/mutexlock.h @@ -97,9 +97,9 @@ class WriteLock { // SpinMutex has very low overhead for low-contention cases. Method names // are chosen so you can use std::unique_lock or std::lock_guard with it. // -class SpinMutex { +class SpinMutexObsolete { public: - SpinMutex() : locked_(false) {} + SpinMutexObsolete() : locked_(false) {} bool try_lock() { auto currently_locked = locked_.load(std::memory_order_relaxed); @@ -128,4 +128,6 @@ class SpinMutex { std::atomic locked_; }; +using SpinMutex = std::mutex; + } // namespace rocksdb diff --git a/util/repeatable_thread.h b/util/repeatable_thread.h index 967cc49945e..7bd66fb51d5 100644 --- a/util/repeatable_thread.h +++ b/util/repeatable_thread.h @@ -97,16 +97,6 @@ class RepeatableThread { } void thread() { -#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) -#if __GLIBC_PREREQ(2, 12) - // Set thread name. - auto thread_handle = thread_.native_handle(); - int ret __attribute__((__unused__)) = - pthread_setname_np(thread_handle, thread_name_.c_str()); - assert(ret == 0); -#endif -#endif - assert(delay_us_ > 0); if (!wait(initial_delay_us_)) { return; diff --git a/util/thread_local.cc b/util/thread_local.cc index 7346eff11e8..21325520bdd 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -142,17 +142,17 @@ class ThreadLocalPtr::StaticMeta { port::Mutex mutex_; #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL // Thread local storage - static __thread ThreadData* tls_; + static photon::thread_local_ptr tls_; #endif // Used to make thread exit trigger possible if !defined(OS_MACOSX). // Otherwise, used to retrieve thread data. - pthread_key_t pthread_key_; + photon::thread_key_t pthread_key_; }; #ifdef ROCKSDB_SUPPORT_THREAD_LOCAL -__thread ThreadData* ThreadLocalPtr::StaticMeta::tls_ = nullptr; +photon::thread_local_ptr ThreadLocalPtr::StaticMeta::tls_(nullptr); #endif // Windows doesn't support a per-thread destructor with its @@ -285,7 +285,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { // scope here in case this OnThreadExit is called after the main thread // dies. auto* inst = tls->inst; - pthread_setspecific(inst->pthread_key_, nullptr); + photon::thread_setspecific(inst->pthread_key_, nullptr); MutexLock l(inst->MemberMutex()); inst->RemoveThreadData(tls); @@ -309,35 +309,11 @@ ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0), head_(this), pthread_key_(0) { - if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) { + if (photon::thread_key_create(&pthread_key_, &OnThreadExit) != 0) { abort(); } - // OnThreadExit is not getting called on the main thread. - // Call through the static destructor mechanism to avoid memory leak. - // - // Caveats: ~A() will be invoked _after_ ~StaticMeta for the global - // singleton (destructors are invoked in reverse order of constructor - // _completion_); the latter must not mutate internal members. This - // cleanup mechanism inherently relies on use-after-release of the - // StaticMeta, and is brittle with respect to compiler-specific handling - // of memory backing destructed statically-scoped objects. Perhaps - // registering with atexit(3) would be more robust. - // -// This is not required on Windows. -#if !defined(OS_WIN) - static struct A { - ~A() { -#ifndef ROCKSDB_SUPPORT_THREAD_LOCAL - ThreadData* tls_ = - static_cast(pthread_getspecific(Instance()->pthread_key_)); -#endif - if (tls_) { - OnThreadExit(tls_); - } - } - } a; -#endif // !defined(OS_WIN) + // Photon's thread key has already supported destruction on main thread head_.next = &head_; head_.prev = &head_; @@ -373,27 +349,27 @@ ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() { static_cast(pthread_getspecific(Instance()->pthread_key_)); #endif - if (UNLIKELY(tls_ == nullptr)) { + if (UNLIKELY(*tls_ == nullptr)) { auto* inst = Instance(); - tls_ = new ThreadData(inst); + *tls_ = new ThreadData(inst); { // Register it in the global chain, needs to be done before thread exit // handler registration MutexLock l(Mutex()); - inst->AddThreadData(tls_); + inst->AddThreadData(*tls_); } // Even it is not OS_MACOSX, need to register value for pthread_key_ so that // its exit handler will be triggered. - if (pthread_setspecific(inst->pthread_key_, tls_) != 0) { + if (photon::thread_setspecific(inst->pthread_key_, *tls_) != 0) { { MutexLock l(Mutex()); - inst->RemoveThreadData(tls_); + inst->RemoveThreadData(*tls_); } - delete tls_; + delete *tls_; abort(); } } - return tls_; + return *tls_; } void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const { diff --git a/util/threadpool_imp.cc b/util/threadpool_imp.cc index acac0063bcd..1d3d4740a0f 100644 --- a/util/threadpool_imp.cc +++ b/util/threadpool_imp.cc @@ -231,31 +231,10 @@ void ThreadPoolImpl::Impl::BGThread(size_t thread_id) { #ifdef OS_LINUX if (decrease_cpu_priority) { - setpriority( - PRIO_PROCESS, - // Current thread. - 0, - // Lowest priority possible. - 19); low_cpu_priority = true; } if (decrease_io_priority) { -#define IOPRIO_CLASS_SHIFT (13) -#define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) - // Put schedule into IOPRIO_CLASS_IDLE class (lowest) - // These system calls only have an effect when used in conjunction - // with an I/O scheduler that supports I/O priorities. As at - // kernel 2.6.17 the only such scheduler is the Completely - // Fair Queuing (CFQ) I/O scheduler. - // To change scheduler: - // echo cfq > /sys/block//queue/schedule - // Tunables to consider: - // /sys/block//queue/slice_idle - // /sys/block//queue/slice_sync - syscall(SYS_ioprio_set, 1, // IOPRIO_WHO_PROCESS - 0, // current thread - IOPRIO_PRIO_VALUE(3, 0)); low_io_priority = true; } #else @@ -337,20 +316,6 @@ void ThreadPoolImpl::Impl::StartBGThreads() { port::Thread p_t(&BGThreadWrapper, new BGThreadMetadata(this, bgthreads_.size())); -// Set the thread name to aid debugging -#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) -#if __GLIBC_PREREQ(2, 12) - auto th_handle = p_t.native_handle(); - std::string thread_priority = Env::PriorityToString(GetThreadPriority()); - std::ostringstream thread_name_stream; - thread_name_stream << "rocksdb:"; - for (char c : thread_priority) { - thread_name_stream << static_cast(tolower(c)); - } - thread_name_stream << bgthreads_.size(); - pthread_setname_np(th_handle, thread_name_stream.str().c_str()); -#endif -#endif bgthreads_.push_back(std::move(p_t)); } } diff --git a/util/timer_queue_test.cc b/util/timer_queue_test.cc index 5f5f08f21bb..a564daf793b 100644 --- a/util/timer_queue_test.cc +++ b/util/timer_queue_test.cc @@ -25,6 +25,7 @@ // #include "util/timer_queue.h" +#include "rocksdb/env.h" #include namespace Timing { @@ -39,6 +40,7 @@ double now() { } // namespace Timing int main() { + rocksdb::PhotonEnv::Singleton(); TimerQueue q; double tnow = Timing::now(); diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index dacc6f03ce3..d242da91de0 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -306,6 +306,7 @@ TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) { } // namespace rocksdb int main(int argc, char** argv) { + PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index 54c89a03abf..af725ae63d7 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -579,6 +579,7 @@ TEST_F(StringAppendOperatorTest, SimpleTestNullDelimiter) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + rocksdb::PhotonEnv::Singleton(); // Run with regular database int result; { diff --git a/utilities/persistent_cache/hash_table_test.cc b/utilities/persistent_cache/hash_table_test.cc index d6ff3e68e42..71f6e43e103 100644 --- a/utilities/persistent_cache/hash_table_test.cc +++ b/utilities/persistent_cache/hash_table_test.cc @@ -156,5 +156,6 @@ TEST_F(EvictableHashTableTest, TestEvict) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); + rocksdb::PhotonEnv::Singleton(); return RUN_ALL_TESTS(); } diff --git a/utilities/transactions/transaction_db_mutex_impl.cc b/utilities/transactions/transaction_db_mutex_impl.cc index 244a950773c..611c47e35be 100644 --- a/utilities/transactions/transaction_db_mutex_impl.cc +++ b/utilities/transactions/transaction_db_mutex_impl.cc @@ -67,6 +67,7 @@ Status TransactionDBMutexImpl::Lock() { } Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) { + // return mutex_.lock(timeout_time) == 0 ? Status::OK() : Status::TimedOut(Status::SubCode::kMutexTimeout); bool locked = true; if (timeout_time == 0) { diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index c0f5a10682a..8881abbb07c 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -1374,11 +1374,13 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrentTest) { if (linked == 1) { // Wait until the others are linked too. while (linked < first_group_size) { + std::this_thread::yield(); } } else if (linked == 1 + first_group_size) { // Make the 2nd batch of the rest of writes plus any followup // commits from the first batch while (linked < txn_cnt + commit_writes) { + std::this_thread::yield(); } } // Then we will have one or more batches consisting of follow-up @@ -1411,13 +1413,16 @@ TEST_P(SeqAdvanceConcurrentTest, SeqAdvanceConcurrentTest) { } // wait to be linked while (linked.load() <= bi) { + std::this_thread::yield(); } // after a queue of size first_group_size if (bi + 1 == first_group_size) { while (!batch_formed) { + std::this_thread::yield(); } // to make it more deterministic, wait until the commits are linked while (linked.load() <= bi + expected_commits) { + std::this_thread::yield(); } } } @@ -3163,6 +3168,7 @@ TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) { } // namespace rocksdb int main(int argc, char** argv) { + rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } From 38151ed05da4d51ef49ec62f209b1bf7b22f3927 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Fri, 28 Jul 2023 16:12:31 +0800 Subject: [PATCH 02/29] upgrate photon to version 0.6 --- CMakeLists.txt | 2 +- env/io_posix.cc | 6 +++--- photon-bench.md | 13 +++---------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 17706d3a7db..fc5b323d408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ set(FETCHCONTENT_QUIET false) FetchContent_Declare( photon GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git - GIT_TAG v0.5.3 + GIT_TAG v0.6.3 ) set(ENABLE_URING ON CACHE INTERNAL "Enable iouring") FetchContent_MakeAvailable(photon) diff --git a/env/io_posix.cc b/env/io_posix.cc index 2fdada93358..29dfc2989e7 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -201,7 +201,7 @@ Status PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, size_t left = n; char* ptr = scratch; while (left > 0) { - r = photon::iouring_pread(fd_, ptr, left, offset, -1); + r = photon::iouring_pread(fd_, ptr, left, offset, 0, -1); if (r <= 0) { break; } @@ -332,7 +332,7 @@ Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, size_t left = n; char* ptr = scratch; while (left > 0) { - r = photon::iouring_pread(fd_, ptr, left, offset, -1); + r = photon::iouring_pread(fd_, ptr, left, offset, 0, -1); if (r <= 0) { break; } @@ -998,7 +998,7 @@ Status PosixRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, size_t left = n; char* ptr = scratch; while (left > 0) { - ssize_t done = photon::iouring_pread(fd_, ptr, left, offset, -1); + ssize_t done = photon::iouring_pread(fd_, ptr, left, offset, 0, -1); if (done < 0) { // error while reading from file return IOError("While reading random read/write file offset " + diff --git a/photon-bench.md b/photon-bench.md index 15f80df5a34..c52eda77651 100644 --- a/photon-bench.md +++ b/photon-bench.md @@ -1,23 +1,16 @@ -## Build +## Run db_bench ```bash -# Auto convert code -./photon-auto-convert.sh - -# Compile -cmake -B build -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D CMAKE_BUILD_TYPE=Release -cmake --build build -t db_bench -j - -# Run benchmark cd build cp ../tools/benchmark.sh . export DB_DIR=`pwd`/test-db export WAL_DIR=$DB_DIR export OUTPUT_DIR=$DB_DIR export COMPRESSION_TYPE=lz4 +export NUM_THREADS=64 export NUM_KEYS=100000000 # Require 14 GB disk space -# Edit benchmark.sh, add `taskset -c 1,8` before the ./db_bench command. +# For large number of threads, you may edit benchmark.sh, and add `taskset -c 1-8` before the ./db_bench command. # This would limit the CPU number for both thread and coroutine, in order to make a fair comparison. # Clean page cache before every test From f630466ea26bc5357ac5ac48acbc421ee805d891 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 6 Aug 2023 17:27:40 +0800 Subject: [PATCH 03/29] Fix io_uring of io_posix.cc --- env/env_posix.cc | 2 +- env/io_posix.cc | 16 ++++++---------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/env/env_posix.cc b/env/env_posix.cc index d55ce71284a..7c01d99a478 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -255,7 +255,7 @@ class PosixEnv : public Env { result->reset(); Status s; int fd = -1; - int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC); + int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC | O_APPEND); // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX) if (options.use_direct_writes && !options.use_mmap_writes) { // Note: we should avoid O_APPEND here due to ta the following bug: diff --git a/env/io_posix.cc b/env/io_posix.cc index 29dfc2989e7..75756625320 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -201,7 +201,7 @@ Status PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, size_t left = n; char* ptr = scratch; while (left > 0) { - r = photon::iouring_pread(fd_, ptr, left, offset, 0, -1); + r = photon::iouring_pread(fd_, ptr, left, (off_t)offset); if (r <= 0) { break; } @@ -332,7 +332,7 @@ Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, size_t left = n; char* ptr = scratch; while (left > 0) { - r = photon::iouring_pread(fd_, ptr, left, offset, 0, -1); + r = photon::iouring_pread(fd_, ptr, left, offset); if (r <= 0) { break; } @@ -754,12 +754,8 @@ Status PosixWritableFile::Append(const Slice& data) { const char* src = data.data(); size_t left = data.size(); while (left != 0) { - ssize_t done = write(fd_, src, left); - std::this_thread::yield(); + ssize_t done = photon::iouring_pwrite(fd_, src, left, -1); if (done < 0) { - if (errno == EINTR) { - continue; - } return IOError("While appending to file", filename_, errno); } left -= done; @@ -779,7 +775,7 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { const char* src = data.data(); size_t left = data.size(); while (left != 0) { - ssize_t done = photon::iouring_pwrite(fd_, src, left, offset, -1);; + ssize_t done = photon::iouring_pwrite(fd_, src, left, (off_t)offset);; if (done < 0) { return IOError("While pwrite to file at offset " + ToString(offset), filename_, errno); @@ -976,7 +972,7 @@ Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) { const char* src = data.data(); size_t left = data.size(); while (left != 0) { - ssize_t done = photon::iouring_pwrite(fd_, src, left, offset, -1); + ssize_t done = photon::iouring_pwrite(fd_, src, left, (off_t)offset); if (done < 0) { // error while writing to file return IOError( @@ -998,7 +994,7 @@ Status PosixRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, size_t left = n; char* ptr = scratch; while (left > 0) { - ssize_t done = photon::iouring_pread(fd_, ptr, left, offset, 0, -1); + ssize_t done = photon::iouring_pread(fd_, ptr, left, (off_t)offset); if (done < 0) { // error while reading from file return IOError("While reading random read/write file offset " + From b63a65452aa15f9b06dda0f482d292788a4a21d4 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Thu, 10 Aug 2023 15:30:37 +0800 Subject: [PATCH 04/29] disable BUILD_TESTING option in photon --- CMakeLists.txt | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc5b323d408..34584fd6b20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,14 +46,21 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") include(FetchContent) set(FETCHCONTENT_QUIET false) -FetchContent_Declare( - photon - GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git - GIT_TAG v0.6.3 -) -set(ENABLE_URING ON CACHE INTERNAL "Enable iouring") -FetchContent_MakeAvailable(photon) + +function(do_fetch_photon) + set(BUILD_TESTING OFF) # cmake variable has its own copy inside a function (scope) + set(ENABLE_URING ON CACHE INTERNAL "Enable iouring") + FetchContent_Declare( + photon + GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git + GIT_TAG v0.6.3 + ) + FetchContent_MakeAvailable(photon) +endfunction() + +do_fetch_photon() set(PHOTON_INCLUDE_DIR ${photon_SOURCE_DIR}/include/) +message(WARNING ${PHOTON_INCLUDE_DIR}) option(INIT_PHOTON_IN_ROCKSDB "INIT PHOTON IN ROCKSDB" OFF) if(INIT_PHOTON_IN_ROCKSDB) From 86a415e57c6b05adb324165eed9e9d5452141aa4 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Fri, 11 Aug 2023 14:34:45 +0800 Subject: [PATCH 05/29] Temporarily use 0.4 backport --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 34584fd6b20..570a43034ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,8 +52,8 @@ function(do_fetch_photon) set(ENABLE_URING ON CACHE INTERNAL "Enable iouring") FetchContent_Declare( photon - GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git - GIT_TAG v0.6.3 + GIT_REPOSITORY https://github.com/beef9999/PhotonLibOS.git + GIT_TAG backport-0.4 ) FetchContent_MakeAvailable(photon) endfunction() From 4cf7bfed0dccae88654ffd6040800196505f5efd Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Mon, 21 Aug 2023 11:09:16 +0800 Subject: [PATCH 06/29] update to release/0.4 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 570a43034ba..509a5ff3674 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,8 +52,8 @@ function(do_fetch_photon) set(ENABLE_URING ON CACHE INTERNAL "Enable iouring") FetchContent_Declare( photon - GIT_REPOSITORY https://github.com/beef9999/PhotonLibOS.git - GIT_TAG backport-0.4 + GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git + GIT_TAG release/0.4 ) FetchContent_MakeAvailable(photon) endfunction() From aef957e47932c2ea29917c41c4868b4c1f10f24b Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Fri, 20 Sep 2024 18:14:26 +0800 Subject: [PATCH 07/29] update photon to 0.8 --- CMakeLists.txt | 24 +- db/db_write_test.cc | 2 +- examples/CMakeLists.txt | 4 +- examples/perf/perf-client.cpp | 143 +++++++----- examples/perf/perf-server.cpp | 420 +++++++++++++--------------------- examples/perf/protocol.h | 26 ++- photon-bench.md | 8 +- photon.md | 21 +- 8 files changed, 307 insertions(+), 341 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 509a5ff3674..733a7a17527 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,22 +45,14 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") include(FetchContent) -set(FETCHCONTENT_QUIET false) - -function(do_fetch_photon) - set(BUILD_TESTING OFF) # cmake variable has its own copy inside a function (scope) - set(ENABLE_URING ON CACHE INTERNAL "Enable iouring") - FetchContent_Declare( - photon - GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git - GIT_TAG release/0.4 - ) - FetchContent_MakeAvailable(photon) -endfunction() - -do_fetch_photon() +set(PHOTON_ENABLE_URING ON CACHE INTERNAL "Enable iouring") +FetchContent_Declare( + photon + GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git + GIT_TAG release/0.8 +) +FetchContent_MakeAvailable(photon) set(PHOTON_INCLUDE_DIR ${photon_SOURCE_DIR}/include/) -message(WARNING ${PHOTON_INCLUDE_DIR}) option(INIT_PHOTON_IN_ROCKSDB "INIT PHOTON IN ROCKSDB" OFF) if(INIT_PHOTON_IN_ROCKSDB) @@ -766,7 +758,7 @@ else() add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES}) target_link_libraries(${ROCKSDB_SHARED_LIB} - ${THIRDPARTY_LIBS} ${SYSTEM_LIBS} -Wl,--whole-archive $ -Wl,--no-whole-archive) + ${THIRDPARTY_LIBS} ${SYSTEM_LIBS} $) set_target_properties(${ROCKSDB_SHARED_LIB} PROPERTIES LINKER_LANGUAGE CXX VERSION ${ROCKSDB_VERSION} diff --git a/db/db_write_test.cc b/db/db_write_test.cc index 69b977ac733..110bd794212 100644 --- a/db/db_write_test.cc +++ b/db/db_write_test.cc @@ -61,7 +61,7 @@ TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) { leader_count++; while (ready_count < kNumThreads) { // busy waiting - photon::thread_yield(); + std::this_thread::yield(); } } }); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9248c920d9f..6383fd9c964 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,7 +1,7 @@ add_executable(perf-client perf/perf-client.cpp) -target_include_directories(perf-client PRIVATE ${PROJECT_SOURCE_DIR}/include ${PHOTON_INCLUDE_DIR}) +target_include_directories(perf-client PRIVATE ${PROJECT_SOURCE_DIR}/include) target_link_libraries(perf-client ${ROCKSDB_STATIC_LIB}) add_executable(perf-server perf/perf-server.cpp) -target_include_directories(perf-server PRIVATE ${PROJECT_SOURCE_DIR}/include ${PHOTON_INCLUDE_DIR}) +target_include_directories(perf-server PRIVATE ${PROJECT_SOURCE_DIR}/include) target_link_libraries(perf-server ${ROCKSDB_STATIC_LIB}) \ No newline at end of file diff --git a/examples/perf/perf-client.cpp b/examples/perf/perf-client.cpp index e84c5ede914..628752ba6d1 100644 --- a/examples/perf/perf-client.cpp +++ b/examples/perf/perf-client.cpp @@ -10,70 +10,109 @@ DEFINE_int32(port, 9527, "server port"); DEFINE_string(host, "127.0.0.1", "server ip"); -DEFINE_string(type, "fill", "fill/read/write"); +DEFINE_string(type, "fill", "fill/get/put"); +DEFINE_int32(concurrency, 32, "concurrency"); +DEFINE_int32(key_num, 100'000, "key num"); +DEFINE_int32(value_size, 256 * 1024, "value size"); -constexpr int CONCURRENCY = 32; -constexpr int MAX_KEY_NUM = 10'000; +static std::string random_value(size_t size) { + static std::random_device rd; + static thread_local std::mt19937 gen(rd()); + static const char alphabet[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; + std::string s; + s.resize(size); + for (size_t i = 0; i < size; ++i) { + s[i] = alphabet[gen() % (sizeof(alphabet) - 1)]; + } + return s; +} -int gen_random_key() { - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution dist(0, MAX_KEY_NUM - 1); - return dist(gen); +static std::string random_key() { + static std::random_device rd; + static thread_local std::mt19937 gen(rd()); + return std::to_string(gen() % FLAGS_key_num); } -void run_perf(photon::net::EndPoint ep, photon::rpc::StubPool* pool) { - int ret; - auto stub = pool->get_stub(ep, false); - DEFER(pool->put_stub(ep, ret < 0)); - - while (true) { - Echo::Request req; - std::string key = std::to_string(gen_random_key()); - req.key.assign(key); - req.write = FLAGS_type == "read" ? false : true; - - Echo::Response resp; - ret = stub->call(req, resp); - if (ret < 0 || resp.ret != 0) abort(); - } +void run_put(photon::net::EndPoint ep, photon::rpc::StubPool* pool) { + int ret; + auto stub = pool->get_stub(ep, false); + DEFER(pool->put_stub(ep, ret < 0)); + + while (true) { + KvPut::Request req; + auto key = random_key(); + req.key.assign(key); + auto val = random_value(FLAGS_value_size); + req.value.assign(val); + + KvPut::Response resp; + ret = stub->call(req, resp); + if (ret < 0 || resp.ret != 0) abort(); + } +} + +void run_get(photon::net::EndPoint ep, photon::rpc::StubPool* pool) { + int ret; + auto stub = pool->get_stub(ep, false); + DEFER(pool->put_stub(ep, ret < 0)); + + while (true) { + KvGet::Request req; + std::string key = random_key(); + req.key.assign(key); + + KvGet::Response resp; + ret = stub->call(req, resp); + if (ret < 0 || resp.ret != 0 || resp.value.size() != FLAGS_value_size) { + abort(); + } + } } void run_fill(photon::net::EndPoint ep, photon::rpc::StubPool* pool) { - int ret; - auto stub = pool->get_stub(ep, false); - DEFER(pool->put_stub(ep, ret < 0)); - - for (int i = 0; i < MAX_KEY_NUM; ++i) { - Echo::Request req; - std::string key = std::to_string(i); - req.key.assign(key); - req.write = FLAGS_type == "read" ? false : true; - - Echo::Response resp; - ret = stub->call(req, resp); - if (ret < 0 || resp.ret != 0) abort(); - } + int ret; + auto stub = pool->get_stub(ep, false); + DEFER(pool->put_stub(ep, ret < 0)); + + for (int i = 0; i < FLAGS_key_num; ++i) { + KvPut::Request req; + auto key = random_key(); + req.key.assign(key); + auto val = random_value(FLAGS_value_size); + req.value.assign(val); + + KvPut::Response resp; + ret = stub->call(req, resp); + if (ret < 0 || resp.ret != 0) { + abort(); + } + } } int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - - if (photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE)) - LOG_ERROR_RETURN(0, -1, "fail to init photon"); - DEFER(photon::fini()); + gflags::ParseCommandLineFlags(&argc, &argv, true); + set_log_output_level(ALOG_INFO); + if (photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE)) { + LOG_ERROR_RETURN(0, -1, "fail to init photon"); + } + DEFER(photon::fini()); - auto ep = photon::net::EndPoint(photon::net::IPAddr(FLAGS_host.c_str()), - FLAGS_port); + auto ep = photon::net::EndPoint(photon::net::IPAddr(FLAGS_host.c_str()), + FLAGS_port); - auto pool = photon::rpc::new_stub_pool(-1, -1, -1); + auto pool = photon::rpc::new_stub_pool(-1, -1, -1); - if (FLAGS_type == "fill") { - run_fill(ep, pool); - } else { - for (int i = 0; i < CONCURRENCY; ++i) { - photon::thread_create11(run_perf, ep, pool); + if (FLAGS_type == "fill") { + run_fill(ep, pool); + } else if (FLAGS_type == "put") { + for (int i = 0; i < FLAGS_concurrency; ++i) { + photon::thread_create11(run_put, ep, pool); + } + photon::thread_sleep(-1); + } else { + for (int i = 0; i < FLAGS_concurrency; ++i) { + photon::thread_create11(run_get, ep, pool); + } + photon::thread_sleep(-1); } - photon::thread_sleep(-1); - } } \ No newline at end of file diff --git a/examples/perf/perf-server.cpp b/examples/perf/perf-server.cpp index 30778baaf30..4c6ea655f32 100644 --- a/examples/perf/perf-server.cpp +++ b/examples/perf/perf-server.cpp @@ -1,3 +1,7 @@ +// Photon版本RocksDB server,在接收RPC的vCPU直接查询DB +// 原生多线程版本RocksDB server,使用WorkPool派发任务到多线程 + +#include #include #include #include @@ -5,6 +9,7 @@ #include #include +#include #include #include #include @@ -15,294 +20,193 @@ #include #include #include -#include #include "protocol.h" DEFINE_int32(port, 9527, "Server listen port"); -DEFINE_int32(show_qps_interval, 10, "interval seconds to show qps"); -DEFINE_int32(vcpu_num, 8, "vcpu number"); -DEFINE_bool(create_new_db, false, "create new db"); +DEFINE_int32(show_qps_interval, 1, "Interval seconds to show qps"); +DEFINE_int32(vcpu_num, 8, "vCPU number"); +DEFINE_bool(use_photon, false, "Use photon rocksdb instead of the native"); +DEFINE_string(db_dir, "perf-db", "DB dir"); +DEFINE_bool(clean_db, false, "Clean db before tests"); static std::atomic qps{0}; static void show_qps_loop() { - while (true) { - photon::thread_sleep(FLAGS_show_qps_interval); - LOG_INFO("QPS: `", qps.load() / FLAGS_show_qps_interval); - qps = 0; - } + while (true) { + photon::thread_sleep(FLAGS_show_qps_interval); + LOG_INFO("QPS: `", qps.load() / FLAGS_show_qps_interval); + qps = 0; + } } -class ExampleServer { - public: - // 协程池对性能影响巨大,如果这里将thread_pool_size降为0,即关闭协程池,则性能变为原先1/3 ~ 1/2 - explicit ExampleServer(int db_num = 1, int thread_pool_size = 65536) - : skeleton(photon::rpc::new_skeleton(true, thread_pool_size)), - server(photon::net::new_tcp_socket_server()), - m_db_num(db_num) { - skeleton->register_service(this); - writeOptions.sync = true; - db_sharding.resize(db_num); - LOG_INFO(VALUE(m_db_num)); - } - - virtual int do_rpc_service(Echo::Request* req, Echo::Response* resp, - IOVector*, IStream*) { - photon_std::this_thread::migrate(); - rocksdb::Status s; - std::string val; - rocksdb::DB* db = db_sharding[std::stoi(req->key.to_std()) % m_db_num]; - if (req->write) { - s = db->Put(writeOptions, - rocksdb::Slice(req->key.c_str(), req->key.size()), "1"); - } else { - s = db->Get(readOptions, - rocksdb::Slice(req->key.c_str(), req->key.size()), &val); - if (val != "1") { - LOG_ERROR("read value error"); - abort(); - } - } - if (!s.ok()) { - LOG_ERROR("db error"); - abort(); +class IOHandler { +public: + IOHandler(rocksdb::DB* db, rocksdb::WriteOptions* writeOptions, + rocksdb::ReadOptions* readOptions, photon::WorkPool* work_pool) : + skeleton_(photon::rpc::new_skeleton(65536U)), + socket_server_(photon::net::new_tcp_socket_server()), + db_(db), + writeOptions_(writeOptions), + readOptions_(readOptions), + work_pool_(work_pool) { + skeleton_->register_service(this); } - resp->ret = 0; - qps++; - return 0; - } - - int serve(photon::net::ISocketStream* stream) { - return skeleton->serve(stream, false); - } - - int run(int port) { - // Optimize RocksDB. This is the easiest way to get RocksDB to perform well - options.IncreaseParallelism(); - options.OptimizeLevelStyleCompaction(); - - options.stats_dump_period_sec = 0; - options.stats_persist_period_sec = 0; - options.enable_pipelined_write = true; - options.compression = rocksdb::CompressionType::kLZ4Compression; - // create the DB if it's not already present - options.create_if_missing = true; - if (open_db()) return -1; - - server->set_handler({this, &ExampleServer::serve}); - server->setsockopt(SOL_SOCKET, SO_REUSEPORT, 1); - if (server->bind(port) < 0) - LOG_ERRNO_RETURN(0, -1, "Failed to bind port `", port) - if (server->listen() < 0) LOG_ERRNO_RETURN(0, -1, "Failed to listen"); - LOG_INFO("Started rpc server at `", server->getsockname()); - return server->start_loop(true); - } - - protected: - static constexpr const char* db_dir = "perf-db"; - - std::unique_ptr skeleton{}; - std::unique_ptr server{}; - std::vector db_sharding{}; // 在一个server里open多个db - rocksdb::Options options; - rocksdb::WriteOptions writeOptions; - rocksdb::ReadOptions readOptions; - int m_db_num; - - virtual int open_db() { - for (int i = 0; i < m_db_num; ++i) { - if (open_db_at_index(i)) - abort(); + int serve(photon::net::ISocketStream* stream) { + return skeleton_->serve(stream); } - return 0; - } - virtual int open_db_at_index(int index) { - std::string path = std::string(get_current_dir_name()) + "/" + - std::string(db_dir) + "-" + std::to_string(index); - if (FLAGS_create_new_db) { - system((std::string("rm -rf ") + path).c_str()); - LOG_INFO("Create new db at `", path.c_str()); - } else { - LOG_INFO("Open db at `", path.c_str()); - } - rocksdb::Status s = rocksdb::DB::Open(options, path, &db_sharding[index]); - if (!s.ok()) { - LOG_ERROR_RETURN(0, -1, "open db ` failed:`", index, s.ToString().c_str()); + int run() { + socket_server_->set_handler({this, &IOHandler::serve}); + socket_server_->setsockopt(SOL_SOCKET, SO_REUSEPORT, 1); + if (socket_server_->bind(FLAGS_port) < 0) { + LOG_ERRNO_RETURN(0, -1, "Failed to bind port `", FLAGS_port) + } + if (socket_server_->listen() < 0) { + LOG_ERRNO_RETURN(0, -1, "Failed to listen"); + } + LOG_INFO("Started rpc server at `", socket_server_->getsockname()); + return socket_server_->start_loop(true); } - return 0; - } -}; - -class ExampleServerWithNativeRocksdb : public ExampleServer { - public: - // 同步线程模式下,线程数量需要设置大一点。可以用taskset限制程序的cpu数量等于协程的vcpu数 - explicit ExampleServerWithNativeRocksdb() - : pool(new photon::WorkPool(256, photon::INIT_EVENT_IOURING, 0)), - ExampleServer() { - - } - int do_rpc_service(Echo::Request* req, Echo::Response* resp, IOVector*, - IStream*) override { - // 使用work pool进行同步线程调用 - pool->call([&] { - rocksdb::Status s; - std::string val; - rocksdb::DB* db = db_sharding[std::stoi(req->key.to_std()) % m_db_num]; - if (req->write) { - s = db->Put(writeOptions, - rocksdb::Slice(req->key.c_str(), req->key.size()), "1"); - } else { - s = db->Get(readOptions, - rocksdb::Slice(req->key.c_str(), req->key.size()), &val); - if (val != "1") abort(); - } - }); - resp->ret = 0; - qps++; - return 0; - } - - private: - photon::WorkPool* pool; -}; -class MultiExampleServer : public ExampleServer { - public: - explicit MultiExampleServer(int index) - : m_index(index), ExampleServer() { - } + int do_rpc_service(KvPut::Request* req, KvPut::Response* resp, IOVector*, IStream*) { + if (FLAGS_use_photon) { + do_put(req); + } else { + photon::semaphore sem; + auto func = new auto([&]() { + do_put(req); + sem.signal(1); + }); + work_pool_->async_call(func); + sem.wait(1); + } + resp->ret = 0; + qps++; + return 0; + } - int do_rpc_service(Echo::Request* req, Echo::Response* resp, IOVector*, - IStream*) override { - rocksdb::Status s; - std::string val; - if (req->write) { - s = db_alone->Put(writeOptions, - rocksdb::Slice(req->key.c_str(), req->key.size()), "1"); - } else { - s = db_alone->Get(readOptions, - rocksdb::Slice(req->key.c_str(), req->key.size()), &val); - if (val != "1") abort(); + int do_rpc_service(KvGet::Request* req, KvGet::Response* resp, IOVector*, IStream*) { + std::string val; + if (FLAGS_use_photon) { + do_get(req, &val); + } else { + photon::semaphore sem; + auto func = new auto([&]() { + do_get(req, &val); + sem.signal(1); + }); + work_pool_->async_call(func); + sem.wait(1); + } + resp->ret = 0; + resp->value.assign(val); + qps++; + return 0; } - resp->ret = 0; - qps++; - return 0; - } - private: - int m_index; - rocksdb::DB* db_alone = nullptr; // 每个server配一个db +private: + std::unique_ptr skeleton_; + std::unique_ptr socket_server_; + rocksdb::DB* db_; // Owned by others + rocksdb::WriteOptions* writeOptions_; // Owned by others + rocksdb::ReadOptions* readOptions_; // Owned by others + photon::WorkPool* work_pool_; // Owned by others + + void do_put(KvPut::Request* req) { + rocksdb::Slice key(req->key.c_str(), req->key.size()); + rocksdb::Slice val(req->value.c_str(), req->value.size()); + rocksdb::Status s = db_->Put(*writeOptions_, key, val); + if (!s.ok()) { + LOG_ERROR("db write error"); + abort(); + } + } - int open_db() override { - std::string path = std::string(get_current_dir_name()) + "/" + - std::string(db_dir) + "-" + std::to_string(m_index); - system((std::string("rm -rf ") + path).c_str()); - LOG_INFO("Create new db at `", path.c_str()); - rocksdb::Status s = rocksdb::DB::Open(options, path, &db_alone); - if (!s.ok()) { - LOG_ERROR_RETURN(0, -1, "open db failed"); + void do_get(KvGet::Request* req, std::string* val) { + rocksdb::Slice key(req->key.c_str(), req->key.size()); + rocksdb::Status s = db_->Get(*readOptions_, key, val); + if (!s.ok()) { + LOG_ERROR("db read error"); + abort(); + } } - return 0; - } }; -class MultiDBExampleServer : public ExampleServer { - public: - explicit MultiDBExampleServer(int db_num) : ExampleServer(db_num) {} - - int do_rpc_service(Echo::Request* req, Echo::Response* resp, IOVector*, - IStream*) override { - rocksdb::Status s; - std::string val; - size_t index = std::stoi(req->key.to_std()) % m_db_num; - - // TODO: modify photon - // photon_std::this_thread::migrate(index); - - rocksdb::DB* db = db_sharding[index]; - if (req->write) { - s = db->Put(writeOptions, - rocksdb::Slice(req->key.c_str(), req->key.size()), "1"); - } else { - s = db->Get(readOptions, - rocksdb::Slice(req->key.c_str(), req->key.size()), &val); - if (val != "1") { - LOG_ERROR("read value error"); - abort(); - } - } - if (!s.ok()) { - LOG_ERROR("db error"); - abort(); +class ExampleServer { +public: + ExampleServer() { + writeOptions.sync = false; + pool = new photon::WorkPool(FLAGS_vcpu_num, photon::INIT_EVENT_IOURING, 0); } - resp->ret = 0; - qps++; - return 0; - } - private: - int open_db() override { - for (int i = 0; i < m_db_num; ++i) { - photon::thread_create11(&MultiDBExampleServer::open_db_at_index, this, i); + int run() { + // Optimize RocksDB. This is the easiest way to get RocksDB to perform well + options.IncreaseParallelism(); + options.OptimizeLevelStyleCompaction(); + + options.stats_dump_period_sec = 0; + options.stats_persist_period_sec = 0; + options.enable_pipelined_write = true; + options.compression = rocksdb::CompressionType::kNoCompression; + // create the DB if it's not already present + options.create_if_missing = true; + + if (open_db()) { + return -1; + } + + for (int i = 0; i < FLAGS_vcpu_num; ++i) { + std::thread([&] { + int ret = photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE); + if (ret) { + abort(); + } + DEFER(photon::fini()); + IOHandler handler(db, &writeOptions, &readOptions, pool); + handler.run(); + }).detach(); + } + return 0; } - return 0; - } - int open_db_at_index(int index) override { - // TODO modify photon - // photon_std::this_thread::migrate(index); - LOG_INFO("Open db ` in vcpu `", index, photon::get_vcpu()); - return ExampleServer::open_db_at_index(index); - } +private: + rocksdb::DB* db = nullptr; + rocksdb::Options options; + rocksdb::WriteOptions writeOptions; + rocksdb::ReadOptions readOptions; + photon::WorkPool* pool = nullptr; + + int open_db() { + auto path = std::string(get_current_dir_name()) + "/" + FLAGS_db_dir; + if (FLAGS_clean_db) { + system((std::string("rm -rf ") + path).c_str()); + LOG_INFO("Create new db at `", path.c_str()); + } else { + LOG_INFO("Open db at `", path.c_str()); + } + rocksdb::Status s = rocksdb::DB::Open(options, path, &db); + if (!s.ok()) { + LOG_ERROR_RETURN(0, -1, "open db failed:`", s.ToString()); + } + return 0; + } }; -// 单server,用thread_migrate迁移到多vcpu -void test_single_server() { - photon_std::work_pool_init(FLAGS_vcpu_num, photon::INIT_EVENT_IOURING, 0); - auto server = new ExampleServer(); - server->run(FLAGS_port); -} - -// 单server,原生多线程版本db -void test_single_server_with_native_rocksdb() { - auto server = new ExampleServerWithNativeRocksdb(); - server->run(FLAGS_port); -} - -// 多server监听同一端口,让内核来分发连接,每个vcpu有一个server,每个server一个db实例 -// 需要修改std-compat.h,让rocksdb内部的thread不会自动迁移 -void test_multiple_servers() { - for (int i = 0; i < FLAGS_vcpu_num; ++i) { - new std::thread([i] { - photon::init(photon::INIT_EVENT_IOURING, 0); - auto server = new MultiExampleServer(i); - server->run(FLAGS_port); - photon::thread_sleep(-1); - }); - } - photon::thread_sleep(-1); -} - -// 一个server,open多个db。每个db只处理自己vcpu上的读请求,不跨vcpu -void test_multi_db_server() { - photon_std::work_pool_init(FLAGS_vcpu_num, photon::INIT_EVENT_IOURING, 0); - auto server = new MultiDBExampleServer(FLAGS_vcpu_num); - server->run(FLAGS_port); -} - int main(int argc, char** argv) { - gflags::ParseCommandLineFlags(&argc, &argv, true); - if (photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE)) - LOG_ERROR_RETURN(0, -1, "fail to init photon"); - DEFER(photon::fini()); + gflags::ParseCommandLineFlags(&argc, &argv, true); + set_log_output_level(ALOG_INFO); + if (photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE)) { + LOG_ERROR_RETURN(0, -1, "fail to init photon"); + } + DEFER(photon::fini()); - photon::thread_create11(show_qps_loop); + photon::thread_create11(show_qps_loop); - test_single_server(); - // test_single_server_with_native_rocksdb(); - // test_multiple_servers(); - // test_multi_db_server(); + auto server = new ExampleServer(); + if (server->run()) { + return -1; + } + photon::thread_sleep(-1UL); } diff --git a/examples/perf/protocol.h b/examples/perf/protocol.h index 1cdd964873d..b31c123295d 100644 --- a/examples/perf/protocol.h +++ b/examples/perf/protocol.h @@ -4,15 +4,15 @@ #include -struct Echo { +struct KvPut { const static uint32_t IID = 1; - const static uint32_t FID = 2; + const static uint32_t FID = 1; struct Request : public photon::rpc::Message { photon::rpc::string key; - bool write; + photon::rpc::string value; - PROCESS_FIELDS(key, write); + PROCESS_FIELDS(key, value); }; struct Response : public photon::rpc::Message { @@ -20,4 +20,22 @@ struct Echo { PROCESS_FIELDS(ret); }; +}; + +struct KvGet { + const static uint32_t IID = 1; + const static uint32_t FID = 2; + + struct Request : public photon::rpc::Message { + photon::rpc::string key; + + PROCESS_FIELDS(key); + }; + + struct Response : public photon::rpc::Message { + int32_t ret; + photon::rpc::string value; + + PROCESS_FIELDS(ret, value); + }; }; \ No newline at end of file diff --git a/photon-bench.md b/photon-bench.md index c52eda77651..67ea3b4651a 100644 --- a/photon-bench.md +++ b/photon-bench.md @@ -6,9 +6,11 @@ cp ../tools/benchmark.sh . export DB_DIR=`pwd`/test-db export WAL_DIR=$DB_DIR export OUTPUT_DIR=$DB_DIR -export COMPRESSION_TYPE=lz4 -export NUM_THREADS=64 -export NUM_KEYS=100000000 # Require 14 GB disk space +export COMPRESSION_TYPE=none +export NUM_THREADS=16 +export KEY_SIZE=75 +export NUM_KEYS=10000000 # Require 10 GB disk space +export VALUE_SIZE=1024 # For large number of threads, you may edit benchmark.sh, and add `taskset -c 1-8` before the ./db_bench command. # This would limit the CPU number for both thread and coroutine, in order to make a fair comparison. diff --git a/photon.md b/photon.md index 7b5857b4d86..03f709b94b8 100644 --- a/photon.md +++ b/photon.md @@ -1,13 +1,24 @@ ## Build ```bash -# Build performance test +# Build performance test on RPC client/server (Photon RocksDB) ./photon-auto-convert.sh cmake -B build -D INIT_PHOTON_IN_ROCKSDB=off -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release -cmake --build build -t perf-client -t perf-server -j +cmake --build build -t perf-client -t perf-server -j 32 -# Build CI tests and db_bench +# Build performance test on RPC client/server (Native RocksDB) +git checkout 6.1.2 +git checkout photon-on-6.1.2 -- examples/ CMakeLists.txt +cmake -B build -D INIT_PHOTON_IN_ROCKSDB=off -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release +cmake --build build -t perf-client -t perf-server -j 32 + +# Build db_bench ./photon-auto-convert.sh cmake -B build -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release -cmake --build build -t db_bench -j -``` +cmake --build build -t db_bench -j 32 + +# Build CI tests +./photon-auto-convert.sh +cmake -B build -D WITH_TESTS=on -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Debug +cmake --build build -j 32 +``` \ No newline at end of file From 7e5773c7e847baed6a7251d024a58077ce48876a Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 22 Sep 2024 14:38:46 +0800 Subject: [PATCH 08/29] minor change --- env/env_posix.cc | 16 +++++++++------- include/rocksdb/env.h | 26 ++++++++++++++------------ 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/env/env_posix.cc b/env/env_posix.cc index 7c01d99a478..d63ac4da5f7 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -1092,24 +1092,26 @@ std::string Env::GenerateUniqueId() { return uuid2; } -PhotonEnv::PhotonEnv() { - int ret = photon::init(photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE); +PhotonEnv::PhotonEnv(int vcpu_num, int ev_engine) { + LOG_INFO("Begin init Photon Env"); + int ret = photon::init(ev_engine, photon::INIT_IO_NONE); if (ret != 0) { - LOG_FATAL("photon init failed"); + LOG_FATAL("Photon init failed"); abort(); } - // Max 8 vcpu. Hardcoded for now. - ret = photon_std::work_pool_init(8, photon::INIT_EVENT_IOURING, photon::INIT_IO_NONE); + ret = photon_std::work_pool_init(vcpu_num, ev_engine, photon::INIT_IO_NONE); if (ret != 0) { - LOG_FATAL("work pool init failed"); + LOG_FATAL("Work-pool init failed"); abort(); } + LOG_INFO("End init Photon Env"); } PhotonEnv::~PhotonEnv() { + LOG_INFO("Begin destruct Photon Env"); photon_std::work_pool_fini(); photon::fini(); - LOG_INFO("PhotonEnv finished"); + LOG_INFO("End destruct Photon Env"); } // diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index e79dcecbd9c..710a8378ff1 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -25,6 +25,7 @@ #include #include "rocksdb/status.h" #include "rocksdb/thread_status.h" +#include "port/port.h" #ifdef _WIN32 // Windows API macro interference @@ -1455,20 +1456,21 @@ Status NewHdfsEnv(Env** hdfs_env, const std::string& fsname); Env* NewTimedEnv(Env* base_env); class PhotonEnv { - public: - static PhotonEnv& Singleton() { - static PhotonEnv instance; - return instance; - } +public: + static PhotonEnv& Singleton() { + // 8 vCPU. Hardcoded for now. + static PhotonEnv instance(8, photon::INIT_EVENT_IOURING); + return instance; + } - PhotonEnv(PhotonEnv const&) = delete; - PhotonEnv(PhotonEnv&&) = delete; - PhotonEnv& operator=(PhotonEnv const&) = delete; - PhotonEnv& operator=(PhotonEnv&&) = delete; + PhotonEnv(PhotonEnv const&) = delete; + PhotonEnv(PhotonEnv&&) = delete; + PhotonEnv& operator=(PhotonEnv const&) = delete; + PhotonEnv& operator=(PhotonEnv&&) = delete; - private: - PhotonEnv(); - ~PhotonEnv(); +private: + PhotonEnv(int vcpu_num, int ev_engine); + ~PhotonEnv(); }; } // namespace rocksdb From b5abf899f5a8c80b92a44595a736a15238356b54 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 22 Sep 2024 23:42:44 +0800 Subject: [PATCH 09/29] update --- env/env_posix.cc | 33 +++++++++++++++++---------------- include/rocksdb/env.h | 4 ++-- include/rocksdb/options.h | 2 +- photon-bench.md | 4 ++-- photon.md | 2 ++ 5 files changed, 24 insertions(+), 21 deletions(-) diff --git a/env/env_posix.cc b/env/env_posix.cc index d63ac4da5f7..ebcdd819644 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -1093,25 +1093,26 @@ std::string Env::GenerateUniqueId() { } PhotonEnv::PhotonEnv(int vcpu_num, int ev_engine) { - LOG_INFO("Begin init Photon Env"); - int ret = photon::init(ev_engine, photon::INIT_IO_NONE); - if (ret != 0) { - LOG_FATAL("Photon init failed"); - abort(); - } - ret = photon_std::work_pool_init(vcpu_num, ev_engine, photon::INIT_IO_NONE); - if (ret != 0) { - LOG_FATAL("Work-pool init failed"); - abort(); - } - LOG_INFO("End init Photon Env"); + LOG_INFO("Begin init Photon Env"); + set_log_output_level(ALOG_INFO); + int ret = photon::init(ev_engine, photon::INIT_IO_NONE); + if (ret != 0) { + LOG_FATAL("Photon init failed"); + abort(); + } + ret = photon_std::work_pool_init(vcpu_num, ev_engine, photon::INIT_IO_NONE); + if (ret != 0) { + LOG_FATAL("Work-pool init failed"); + abort(); + } + LOG_INFO("End init Photon Env"); } PhotonEnv::~PhotonEnv() { - LOG_INFO("Begin destruct Photon Env"); - photon_std::work_pool_fini(); - photon::fini(); - LOG_INFO("End destruct Photon Env"); + LOG_INFO("Begin destruct Photon Env"); + photon_std::work_pool_fini(); + photon::fini(); + LOG_INFO("End destruct Photon Env"); } // diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 710a8378ff1..6d6b3a74afa 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -1458,8 +1458,8 @@ Env* NewTimedEnv(Env* base_env); class PhotonEnv { public: static PhotonEnv& Singleton() { - // 8 vCPU. Hardcoded for now. - static PhotonEnv instance(8, photon::INIT_EVENT_IOURING); + // 16 vCPU. Hardcoded for now. + static PhotonEnv instance(16, photon::INIT_EVENT_IOURING); return instance; } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index f7d6dfaf58d..1b1a1dc53c3 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -357,7 +357,7 @@ struct DBOptions { // `total_threads` is used. Good value for `total_threads` is the number of // cores. You almost definitely want to call this function if your system is // bottlenecked by RocksDB. - DBOptions* IncreaseParallelism(int total_threads = 16); + DBOptions* IncreaseParallelism(int total_threads = 128); #endif // ROCKSDB_LITE // If true, the database will be created if it is missing. diff --git a/photon-bench.md b/photon-bench.md index 67ea3b4651a..abcff40e9c8 100644 --- a/photon-bench.md +++ b/photon-bench.md @@ -1,4 +1,4 @@ -## Run db_bench +## Standalone db_bench tests ```bash cd build @@ -8,7 +8,7 @@ export WAL_DIR=$DB_DIR export OUTPUT_DIR=$DB_DIR export COMPRESSION_TYPE=none export NUM_THREADS=16 -export KEY_SIZE=75 +export KEY_SIZE=100 export NUM_KEYS=10000000 # Require 10 GB disk space export VALUE_SIZE=1024 diff --git a/photon.md b/photon.md index 03f709b94b8..7433cfee2a3 100644 --- a/photon.md +++ b/photon.md @@ -21,4 +21,6 @@ cmake --build build -t db_bench -j 32 ./photon-auto-convert.sh cmake -B build -D WITH_TESTS=on -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Debug cmake --build build -j 32 +ulimit -n 100000 +cd build && ctest . ``` \ No newline at end of file From 3cfd050745bcd68def78778369099d39e41b1905 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Mon, 23 Sep 2024 23:51:09 +0800 Subject: [PATCH 10/29] fix gcc 13 compile --- db/range_del_aggregator.h | 1 + include/rocksdb/slice.h | 1 + util/string_util.h | 1 + 3 files changed, 3 insertions(+) diff --git a/db/range_del_aggregator.h b/db/range_del_aggregator.h index 712ae458390..469f58f41fa 100644 --- a/db/range_del_aggregator.h +++ b/db/range_del_aggregator.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "db/compaction_iteration_stats.h" #include "db/dbformat.h" diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 2b01e6d9a66..931f2731e16 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #ifdef __cpp_lib_string_view diff --git a/util/string_util.h b/util/string_util.h index 6e125ddfa8f..36aad308567 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace rocksdb { From d08b4d19d23df5973477e01f8e59ab8cf4845382 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Tue, 24 Sep 2024 00:40:35 +0800 Subject: [PATCH 11/29] update readme --- photon.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/photon.md b/photon.md index 7433cfee2a3..156a42c2aab 100644 --- a/photon.md +++ b/photon.md @@ -1,6 +1,9 @@ ## Build ```bash +# Install dependencies +apt install libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev + # Build performance test on RPC client/server (Photon RocksDB) ./photon-auto-convert.sh cmake -B build -D INIT_PHOTON_IN_ROCKSDB=off -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release From b50a1bfa1276d65a3ffb1f9579f9780d036e3473 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Tue, 24 Sep 2024 11:06:56 +0800 Subject: [PATCH 12/29] add epoll/iouring option --- CMakeLists.txt | 6 ++++- env/io_posix.cc | 60 +++++++++++++++++++++++++++++++++++-------- include/rocksdb/env.h | 4 +++ photon.md | 8 +++--- 4 files changed, 63 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 733a7a17527..98aa3877efb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,7 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") include(FetchContent) -set(PHOTON_ENABLE_URING ON CACHE INTERNAL "Enable iouring") +set(PHOTON_ENABLE_URING OFF CACHE INTERNAL "Enable iouring") FetchContent_Declare( photon GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git @@ -54,6 +54,10 @@ FetchContent_Declare( FetchContent_MakeAvailable(photon) set(PHOTON_INCLUDE_DIR ${photon_SOURCE_DIR}/include/) +if (PHOTON_ENABLE_URING) + add_compile_definitions("PHOTON_ENABLE_URING") +endif () + option(INIT_PHOTON_IN_ROCKSDB "INIT PHOTON IN ROCKSDB" OFF) if(INIT_PHOTON_IN_ROCKSDB) add_definitions(-DINIT_PHOTON_IN_ROCKSDB) diff --git a/env/io_posix.cc b/env/io_posix.cc index 75756625320..123980d3740 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -40,6 +40,46 @@ #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12) #endif +#ifdef PHOTON_ENABLE_URING +ssize_t photon_read(int fd, void* buf, size_t count) { + return photon::iouring_pread(fd, buf, count, -1); +} +ssize_t photon_write(int fd, const void* buf, size_t count) { + return photon::iouring_pwrite(fd, buf, count, -1); +} +ssize_t photon_pread(int fd, void* buf, size_t count, off_t offset) { + return photon::iouring_pread(fd, buf, count, offset); +} +ssize_t photon_pwrite(int fd, const void* buf, size_t count, off_t offset) { + return photon::iouring_pwrite(fd, buf, count, offset); +} +int photon_fsync(int fd) { + return photon::iouring_fsync(fd); +} +int photon_fdatasync(int fd) { + return photon::iouring_fdatasync(fd); +} +#else +ssize_t photon_read(int fd, void* buf, size_t count) { + return read(fd, buf, count); +} +ssize_t photon_write(int fd, const void* buf, size_t count) { + return write(fd, buf, count); +} +ssize_t photon_pread(int fd, void* buf, size_t count, off_t offset) { + return pread(fd, buf, count, offset); +} +ssize_t photon_pwrite(int fd, const void* buf, size_t count, off_t offset) { + return pwrite(fd, buf, count, offset); +} +int photon_fsync(int fd) { + return fsync(fd); +} +int photon_fdatasync(int fd) { + return fdatasync(fd); +} +#endif + namespace rocksdb { // A wrapper for fadvise, if the platform doesn't support fadvise, @@ -201,7 +241,7 @@ Status PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, size_t left = n; char* ptr = scratch; while (left > 0) { - r = photon::iouring_pread(fd_, ptr, left, (off_t)offset); + r = photon_pread(fd_, ptr, left, (off_t)offset); if (r <= 0) { break; } @@ -332,7 +372,7 @@ Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, size_t left = n; char* ptr = scratch; while (left > 0) { - r = photon::iouring_pread(fd_, ptr, left, offset); + r = photon_pread(fd_, ptr, left, offset); if (r <= 0) { break; } @@ -754,7 +794,7 @@ Status PosixWritableFile::Append(const Slice& data) { const char* src = data.data(); size_t left = data.size(); while (left != 0) { - ssize_t done = photon::iouring_pwrite(fd_, src, left, -1); + ssize_t done = photon_write(fd_, src, left); if (done < 0) { return IOError("While appending to file", filename_, errno); } @@ -775,7 +815,7 @@ Status PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset) { const char* src = data.data(); size_t left = data.size(); while (left != 0) { - ssize_t done = photon::iouring_pwrite(fd_, src, left, (off_t)offset);; + ssize_t done = photon_pwrite(fd_, src, left, (off_t)offset);; if (done < 0) { return IOError("While pwrite to file at offset " + ToString(offset), filename_, errno); @@ -858,14 +898,14 @@ Status PosixWritableFile::Close() { Status PosixWritableFile::Flush() { return Status::OK(); } Status PosixWritableFile::Sync() { - if (photon::iouring_fdatasync(fd_) < 0) { + if (photon_fdatasync(fd_) < 0) { return IOError("While fdatasync", filename_, errno); } return Status::OK(); } Status PosixWritableFile::Fsync() { - if (photon::iouring_fsync(fd_) < 0) { + if (photon_fsync(fd_) < 0) { return IOError("While fsync", filename_, errno); } return Status::OK(); @@ -972,7 +1012,7 @@ Status PosixRandomRWFile::Write(uint64_t offset, const Slice& data) { const char* src = data.data(); size_t left = data.size(); while (left != 0) { - ssize_t done = photon::iouring_pwrite(fd_, src, left, (off_t)offset); + ssize_t done = photon_pwrite(fd_, src, left, (off_t)offset); if (done < 0) { // error while writing to file return IOError( @@ -994,7 +1034,7 @@ Status PosixRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, size_t left = n; char* ptr = scratch; while (left > 0) { - ssize_t done = photon::iouring_pread(fd_, ptr, left, (off_t)offset); + ssize_t done = photon_pread(fd_, ptr, left, (off_t)offset); if (done < 0) { // error while reading from file return IOError("While reading random read/write file offset " + @@ -1018,14 +1058,14 @@ Status PosixRandomRWFile::Read(uint64_t offset, size_t n, Slice* result, Status PosixRandomRWFile::Flush() { return Status::OK(); } Status PosixRandomRWFile::Sync() { - if (photon::iouring_fdatasync(fd_) < 0) { + if (photon_fdatasync(fd_) < 0) { return IOError("While fdatasync random read/write file", filename_, errno); } return Status::OK(); } Status PosixRandomRWFile::Fsync() { - if (photon::iouring_fsync(fd_) < 0) { + if (photon_fsync(fd_) < 0) { return IOError("While fsync random read/write file", filename_, errno); } return Status::OK(); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 6d6b3a74afa..35f503fe2f1 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -1459,7 +1459,11 @@ class PhotonEnv { public: static PhotonEnv& Singleton() { // 16 vCPU. Hardcoded for now. +#ifdef PHOTON_ENABLE_URING static PhotonEnv instance(16, photon::INIT_EVENT_IOURING); +#else + static PhotonEnv instance(16, photon::INIT_EVENT_EPOLL); +#endif return instance; } diff --git a/photon.md b/photon.md index 156a42c2aab..7850bc26e10 100644 --- a/photon.md +++ b/photon.md @@ -7,23 +7,23 @@ apt install libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd # Build performance test on RPC client/server (Photon RocksDB) ./photon-auto-convert.sh cmake -B build -D INIT_PHOTON_IN_ROCKSDB=off -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release -cmake --build build -t perf-client -t perf-server -j 32 +cmake --build build -t perf-client -t perf-server -j `nproc` # Build performance test on RPC client/server (Native RocksDB) git checkout 6.1.2 git checkout photon-on-6.1.2 -- examples/ CMakeLists.txt cmake -B build -D INIT_PHOTON_IN_ROCKSDB=off -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release -cmake --build build -t perf-client -t perf-server -j 32 +cmake --build build -t perf-client -t perf-server -j `nproc` # Build db_bench ./photon-auto-convert.sh cmake -B build -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release -cmake --build build -t db_bench -j 32 +cmake --build build -t db_bench -j `nproc` # Build CI tests ./photon-auto-convert.sh cmake -B build -D WITH_TESTS=on -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Debug -cmake --build build -j 32 +cmake --build build -j `nproc` ulimit -n 100000 cd build && ctest . ``` \ No newline at end of file From d83657880e6506ff2652b2b0636c6ca9107125a6 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Tue, 24 Sep 2024 15:48:21 +0800 Subject: [PATCH 13/29] db_compaction_test --- db/db_compaction_test.cc | 1 - env/env_posix.cc | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index d6567071a3f..df51ef2ca2a 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -4349,7 +4349,6 @@ TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) { } // namespace rocksdb int main(int argc, char** argv) { - rocksdb::PhotonEnv::Singleton(); #if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); diff --git a/env/env_posix.cc b/env/env_posix.cc index ebcdd819644..856490ce673 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -1014,6 +1014,7 @@ PosixEnv::PosixEnv() page_size_(getpagesize()), thread_pools_(Priority::TOTAL), allow_non_owner_access_(true) { + LOG_INFO("global PosixEnv construct: Create thread pools"); for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { thread_pools_[pool_id].SetThreadPriority( static_cast(pool_id)); From 6733e6b3d96faa9f5776913d8531a1ec506749d7 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Tue, 24 Sep 2024 16:17:11 +0800 Subject: [PATCH 14/29] inlineskiplist_test --- memtable/inlineskiplist_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/memtable/inlineskiplist_test.cc b/memtable/inlineskiplist_test.cc index 651a97b82c9..b416ef7c557 100644 --- a/memtable/inlineskiplist_test.cc +++ b/memtable/inlineskiplist_test.cc @@ -640,7 +640,6 @@ TEST_F(InlineSkipTest, ConcurrentInsert3) { RunConcurrentInsert(3); } } // namespace rocksdb int main(int argc, char** argv) { - rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } From cb7a21b1bdc33a3b29d38419b644d31a3b9dce05 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Tue, 24 Sep 2024 20:59:08 +0800 Subject: [PATCH 15/29] fix skiplist_test --- memtable/skiplist_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/memtable/skiplist_test.cc b/memtable/skiplist_test.cc index 1cfdcb411ef..50c3588bb86 100644 --- a/memtable/skiplist_test.cc +++ b/memtable/skiplist_test.cc @@ -383,7 +383,6 @@ TEST_F(SkipTest, Concurrent5) { RunConcurrent(5); } } // namespace rocksdb int main(int argc, char** argv) { - rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } From 8d097dab06b2d9d1c6e9fbcaff14b9f79bc3c203 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Wed, 25 Sep 2024 15:44:54 +0800 Subject: [PATCH 16/29] fix cassandra_functional_test stringappend_test write_prepared_transaction_test --- utilities/cassandra/cassandra_functional_test.cc | 1 - utilities/merge_operators/string_append/stringappend_test.cc | 1 - utilities/transactions/write_prepared_transaction_test.cc | 1 - 3 files changed, 3 deletions(-) diff --git a/utilities/cassandra/cassandra_functional_test.cc b/utilities/cassandra/cassandra_functional_test.cc index d242da91de0..dacc6f03ce3 100644 --- a/utilities/cassandra/cassandra_functional_test.cc +++ b/utilities/cassandra/cassandra_functional_test.cc @@ -306,7 +306,6 @@ TEST_F(CassandraFunctionalTest, CompactionShouldRemoveTombstoneFromPut) { } // namespace rocksdb int main(int argc, char** argv) { - PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/utilities/merge_operators/string_append/stringappend_test.cc b/utilities/merge_operators/string_append/stringappend_test.cc index af725ae63d7..54c89a03abf 100644 --- a/utilities/merge_operators/string_append/stringappend_test.cc +++ b/utilities/merge_operators/string_append/stringappend_test.cc @@ -579,7 +579,6 @@ TEST_F(StringAppendOperatorTest, SimpleTestNullDelimiter) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - rocksdb::PhotonEnv::Singleton(); // Run with regular database int result; { diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index 8881abbb07c..015185520a5 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -3168,7 +3168,6 @@ TEST_P(WritePreparedTransactionTest, WC_WP_WALForwardIncompatibility) { } // namespace rocksdb int main(int argc, char** argv) { - rocksdb::PhotonEnv::Singleton(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } From e97c3a7dd3a63a06fee69b6d40107d2374965709 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Wed, 25 Sep 2024 23:36:57 +0800 Subject: [PATCH 17/29] fix prefix_test --- db/prefix_test.cc | 16 ++++++++++++---- db/version_edit.h | 16 ++++++++-------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/db/prefix_test.cc b/db/prefix_test.cc index ac854cb3dbd..affb2ffe4bd 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -43,13 +43,17 @@ DEFINE_uint64(num_locks, 10001, "number of locks"); DEFINE_bool(random_prefix, false, "randomize prefix"); DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); -DEFINE_int64(write_buffer_size, 33554432, ""); -DEFINE_int32(max_write_buffer_number, 2, ""); -DEFINE_int32(min_write_buffer_number_to_merge, 1, ""); +// DEFINE_int64(write_buffer_size, 33554432, ""); +DECLARE_int64(write_buffer_size); +// DEFINE_int32(max_write_buffer_number, 2, ""); +DECLARE_int32(max_write_buffer_number); +// DEFINE_int32(min_write_buffer_number_to_merge, 1, ""); +DECLARE_int32(min_write_buffer_number_to_merge); DEFINE_int32(skiplist_height, 4, ""); DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, ""); DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, ""); -DEFINE_int32(value_size, 40, ""); +// DEFINE_int32(value_size, 40, ""); +DECLARE_int32(value_size); DEFINE_bool(enable_print, false, "Print options generated to console."); // Path to the database on file system @@ -876,6 +880,10 @@ TEST_F(PrefixTest, PrefixSeekModePrev3) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); ParseCommandLineFlags(&argc, &argv, true); + FLAGS_write_buffer_size = 33554432; + FLAGS_max_write_buffer_number = 2; + FLAGS_min_write_buffer_number_to_merge = 1; + FLAGS_value_size = 40; return RUN_ALL_TESTS(); } diff --git a/db/version_edit.h b/db/version_edit.h index ee6499cdc3b..d707dd35e71 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -52,14 +52,14 @@ struct FileDescriptor { smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno) {} - FileDescriptor& operator=(const FileDescriptor& fd) { - table_reader = fd.table_reader; - packed_number_and_path_id = fd.packed_number_and_path_id; - file_size = fd.file_size; - smallest_seqno = fd.smallest_seqno; - largest_seqno = fd.largest_seqno; - return *this; - } + // FileDescriptor& operator=(const FileDescriptor& fd) { + // table_reader = fd.table_reader; + // packed_number_and_path_id = fd.packed_number_and_path_id; + // file_size = fd.file_size; + // smallest_seqno = fd.smallest_seqno; + // largest_seqno = fd.largest_seqno; + // return *this; + // } uint64_t GetNumber() const { return packed_number_and_path_id & kFileNumberMask; From 0a76da797b4cf6c5683af97fdd85cb7309f7f078 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Thu, 26 Sep 2024 00:25:52 +0800 Subject: [PATCH 18/29] improve perf --- examples/perf/perf-client.cpp | 1 + examples/perf/perf-server.cpp | 2 ++ 2 files changed, 3 insertions(+) diff --git a/examples/perf/perf-client.cpp b/examples/perf/perf-client.cpp index 628752ba6d1..a19bdf708a7 100644 --- a/examples/perf/perf-client.cpp +++ b/examples/perf/perf-client.cpp @@ -101,6 +101,7 @@ int main(int argc, char** argv) { FLAGS_port); auto pool = photon::rpc::new_stub_pool(-1, -1, -1); + DEFER(delete pool); if (FLAGS_type == "fill") { run_fill(ep, pool); diff --git a/examples/perf/perf-server.cpp b/examples/perf/perf-server.cpp index 4c6ea655f32..73a49d6f11e 100644 --- a/examples/perf/perf-server.cpp +++ b/examples/perf/perf-server.cpp @@ -145,6 +145,8 @@ class ExampleServer { // Optimize RocksDB. This is the easiest way to get RocksDB to perform well options.IncreaseParallelism(); options.OptimizeLevelStyleCompaction(); + options.allow_concurrent_memtable_write = false; + options.enable_pipelined_write = false; options.stats_dump_period_sec = 0; options.stats_persist_period_sec = 0; From 0cfb18df188918d424deb48df54172fe9a53aa90 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Thu, 26 Sep 2024 00:27:44 +0800 Subject: [PATCH 19/29] update readme --- photon.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/photon.md b/photon.md index 7850bc26e10..efe1ff90362 100644 --- a/photon.md +++ b/photon.md @@ -26,4 +26,9 @@ cmake -B build -D WITH_TESTS=on -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS cmake --build build -j `nproc` ulimit -n 100000 cd build && ctest . +``` + +```bash +# TODO +-D PHOTON_ENABLE_URING=on ``` \ No newline at end of file From f5f9af204131234b1b129ead4cf2bcd06fe147c4 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Fri, 27 Sep 2024 16:24:09 +0800 Subject: [PATCH 20/29] update bench doc --- photon-bench.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/photon-bench.md b/photon-bench.md index abcff40e9c8..ceb25fcc29f 100644 --- a/photon-bench.md +++ b/photon-bench.md @@ -11,6 +11,8 @@ export NUM_THREADS=16 export KEY_SIZE=100 export NUM_KEYS=10000000 # Require 10 GB disk space export VALUE_SIZE=1024 +export DURATION=300 +# export CACHE_SIZE=0 # For large number of threads, you may edit benchmark.sh, and add `taskset -c 1-8` before the ./db_bench command. # This would limit the CPU number for both thread and coroutine, in order to make a fair comparison. From f0c485bb66b708784eee42062576fcadc07b0728 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Thu, 3 Oct 2024 16:28:28 +0800 Subject: [PATCH 21/29] build ok --- CMakeLists.txt | 12 +++++++----- env/env_posix.cc | 2 +- examples/perf/perf-client.cpp | 2 +- photon.md | 10 +++++----- port/port_posix.cc | 16 ++++++++-------- 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 98aa3877efb..5c5be3cf652 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,8 +48,10 @@ include(FetchContent) set(PHOTON_ENABLE_URING OFF CACHE INTERNAL "Enable iouring") FetchContent_Declare( photon - GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git - GIT_TAG release/0.8 + # GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git + GIT_REPOSITORY https://github.com/beef9999/PhotonLibOS.git + # GIT_TAG release/0.8 + GIT_TAG fix-more-Wshadow-warnings ) FetchContent_MakeAvailable(photon) set(PHOTON_INCLUDE_DIR ${photon_SOURCE_DIR}/include/) @@ -58,9 +60,9 @@ if (PHOTON_ENABLE_URING) add_compile_definitions("PHOTON_ENABLE_URING") endif () -option(INIT_PHOTON_IN_ROCKSDB "INIT PHOTON IN ROCKSDB" OFF) -if(INIT_PHOTON_IN_ROCKSDB) - add_definitions(-DINIT_PHOTON_IN_ROCKSDB) +option(INIT_PHOTON_IN_ENV "INIT PHOTON IN ROCKSDB" OFF) +if(INIT_PHOTON_IN_ENV) + add_definitions(-DINIT_PHOTON_IN_ENV) endif() option(WITH_JEMALLOC "build with JeMalloc" OFF) diff --git a/env/env_posix.cc b/env/env_posix.cc index 856490ce673..088129eae8e 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -1130,7 +1130,7 @@ Env* Env::Default() { // of their construction, having this call here guarantees that // the destructor of static PosixEnv will go first, then the // the singletons of ThreadLocalPtr. -#ifdef INIT_PHOTON_IN_ROCKSDB +#ifdef INIT_PHOTON_IN_ENV PhotonEnv::Singleton(); #endif ThreadLocalPtr::InitSingletons(); diff --git a/examples/perf/perf-client.cpp b/examples/perf/perf-client.cpp index a19bdf708a7..0f31c9e595f 100644 --- a/examples/perf/perf-client.cpp +++ b/examples/perf/perf-client.cpp @@ -63,7 +63,7 @@ void run_get(photon::net::EndPoint ep, photon::rpc::StubPool* pool) { KvGet::Response resp; ret = stub->call(req, resp); - if (ret < 0 || resp.ret != 0 || resp.value.size() != FLAGS_value_size) { + if (ret < 0 || resp.ret != 0 || resp.value.size() != (uint64_t) FLAGS_value_size) { abort(); } } diff --git a/photon.md b/photon.md index efe1ff90362..1c2e2eebece 100644 --- a/photon.md +++ b/photon.md @@ -2,29 +2,29 @@ ```bash # Install dependencies +dnf install gflags-devel snappy-devel zlib-devel bzip2-devel lz4-devel libzstd-devel apt install libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev # Build performance test on RPC client/server (Photon RocksDB) ./photon-auto-convert.sh -cmake -B build -D INIT_PHOTON_IN_ROCKSDB=off -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release +cmake -B build -D INIT_PHOTON_IN_ENV=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release cmake --build build -t perf-client -t perf-server -j `nproc` # Build performance test on RPC client/server (Native RocksDB) git checkout 6.1.2 git checkout photon-on-6.1.2 -- examples/ CMakeLists.txt -cmake -B build -D INIT_PHOTON_IN_ROCKSDB=off -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release +cmake -B build -D INIT_PHOTON_IN_ENV=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release cmake --build build -t perf-client -t perf-server -j `nproc` # Build db_bench ./photon-auto-convert.sh -cmake -B build -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release +cmake -B build -D INIT_PHOTON_IN_ENV=on -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release cmake --build build -t db_bench -j `nproc` # Build CI tests ./photon-auto-convert.sh -cmake -B build -D WITH_TESTS=on -D INIT_PHOTON_IN_ROCKSDB=on -D FAIL_ON_WARNINGS=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Debug +cmake -B build -D WITH_TESTS=on -D INIT_PHOTON_IN_ENV=on -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Debug cmake --build build -j `nproc` -ulimit -n 100000 cd build && ctest . ``` diff --git a/port/port_posix.cc b/port/port_posix.cc index 1cbc41e0935..455d6eb1eb6 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -42,13 +42,13 @@ extern const bool kDefaultToAdaptiveMutex = false; namespace port { -static int PthreadCall(const char* label, int result) { - if (result != 0 && result != ETIMEDOUT) { - fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - abort(); - } - return result; -} +// static int PthreadCall(const char* label, int result) { +// if (result != 0 && result != ETIMEDOUT) { +// fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); +// abort(); +// } +// return result; +// } Mutex::Mutex(bool adaptive) { } @@ -97,7 +97,7 @@ bool CondVar::TimedWait(uint64_t abs_time_us) { #endif auto abs_now_us = std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()).count(); - uint64_t timeout = abs_time_us > abs_now_us ? abs_time_us - abs_now_us : 0; + uint64_t timeout = abs_time_us > uint64_t(abs_now_us) ? abs_time_us - abs_now_us : 0; int ret = cv_.wait(mu_->mu_, timeout); #ifndef NDEBUG mu_->locked_ = true; From 6ce8866dc3e641f8ef45e52acac548150679db6f Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 6 Oct 2024 16:41:38 +0800 Subject: [PATCH 22/29] change vcpu from 16 to 8 --- include/rocksdb/env.h | 6 +++--- photon-bench.md | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 35f503fe2f1..588bb824757 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -1458,11 +1458,11 @@ Env* NewTimedEnv(Env* base_env); class PhotonEnv { public: static PhotonEnv& Singleton() { - // 16 vCPU. Hardcoded for now. + // 8 vCPU. Hardcoded for now. #ifdef PHOTON_ENABLE_URING - static PhotonEnv instance(16, photon::INIT_EVENT_IOURING); + static PhotonEnv instance(8, photon::INIT_EVENT_IOURING); #else - static PhotonEnv instance(16, photon::INIT_EVENT_EPOLL); + static PhotonEnv instance(8, photon::INIT_EVENT_EPOLL); #endif return instance; } diff --git a/photon-bench.md b/photon-bench.md index ceb25fcc29f..3cad580a4ed 100644 --- a/photon-bench.md +++ b/photon-bench.md @@ -3,19 +3,22 @@ ```bash cd build cp ../tools/benchmark.sh . + export DB_DIR=`pwd`/test-db export WAL_DIR=$DB_DIR export OUTPUT_DIR=$DB_DIR export COMPRESSION_TYPE=none -export NUM_THREADS=16 -export KEY_SIZE=100 -export NUM_KEYS=10000000 # Require 10 GB disk space -export VALUE_SIZE=1024 -export DURATION=300 -# export CACHE_SIZE=0 +export NUM_THREADS=64 # Concurrency +export KEY_SIZE=20 +export VALUE_SIZE=400 +export NUM_KEYS=100000000 # Require 40 GB disk space +export CACHE_SIZE=0 # Disable block cache. Need to remove --pin_l0_filter_and_index_blocks_in_cache=1 argument from benchmark.sh +export DURATION=60 # Only run 1 minutes -# For large number of threads, you may edit benchmark.sh, and add `taskset -c 1-8` before the ./db_bench command. -# This would limit the CPU number for both thread and coroutine, in order to make a fair comparison. +# In env.h, Photon now has hardcoded to use 8 vCPUs. +# In order to make a fair comparison, you may edit benchmark.sh, and add `taskset -c 1-8` before the ./db_bench command. +# This would limit the CPU number for both thread and coroutine. +# But it's not necessary, because essentially these two concurrency models are quite different. # Clean page cache before every test echo 3 > /proc/sys/vm/drop_caches From d149db7ee84eaa542f3c994f9503294918abe3e7 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 6 Oct 2024 21:31:23 +0800 Subject: [PATCH 23/29] update perf-server --- examples/perf/perf-server.cpp | 12 ++++++------ include/rocksdb/options.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/perf/perf-server.cpp b/examples/perf/perf-server.cpp index 73a49d6f11e..5f870a75357 100644 --- a/examples/perf/perf-server.cpp +++ b/examples/perf/perf-server.cpp @@ -142,15 +142,15 @@ class ExampleServer { } int run() { - // Optimize RocksDB. This is the easiest way to get RocksDB to perform well - options.IncreaseParallelism(); + // Optimize RocksDB + if (FLAGS_use_photon) { + options.IncreaseParallelism(256); + } else { + options.IncreaseParallelism(); + } options.OptimizeLevelStyleCompaction(); options.allow_concurrent_memtable_write = false; options.enable_pipelined_write = false; - - options.stats_dump_period_sec = 0; - options.stats_persist_period_sec = 0; - options.enable_pipelined_write = true; options.compression = rocksdb::CompressionType::kNoCompression; // create the DB if it's not already present options.create_if_missing = true; diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 1b1a1dc53c3..f7d6dfaf58d 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -357,7 +357,7 @@ struct DBOptions { // `total_threads` is used. Good value for `total_threads` is the number of // cores. You almost definitely want to call this function if your system is // bottlenecked by RocksDB. - DBOptions* IncreaseParallelism(int total_threads = 128); + DBOptions* IncreaseParallelism(int total_threads = 16); #endif // ROCKSDB_LITE // If true, the database will be created if it is missing. From f737090c35d66d48d61e00849c18d5ec7859113b Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 6 Oct 2024 21:43:27 +0800 Subject: [PATCH 24/29] update perf --- examples/perf/perf-client.cpp | 4 ++-- examples/perf/perf-server.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/perf/perf-client.cpp b/examples/perf/perf-client.cpp index 0f31c9e595f..b8945967799 100644 --- a/examples/perf/perf-client.cpp +++ b/examples/perf/perf-client.cpp @@ -17,7 +17,7 @@ DEFINE_int32(value_size, 256 * 1024, "value size"); static std::string random_value(size_t size) { static std::random_device rd; - static thread_local std::mt19937 gen(rd()); + static thread_local std::mt19937_64 gen(rd()); static const char alphabet[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; std::string s; s.resize(size); @@ -29,7 +29,7 @@ static std::string random_value(size_t size) { static std::string random_key() { static std::random_device rd; - static thread_local std::mt19937 gen(rd()); + static thread_local std::mt19937_64 gen(rd()); return std::to_string(gen() % FLAGS_key_num); } diff --git a/examples/perf/perf-server.cpp b/examples/perf/perf-server.cpp index 5f870a75357..995bebb15b9 100644 --- a/examples/perf/perf-server.cpp +++ b/examples/perf/perf-server.cpp @@ -145,12 +145,12 @@ class ExampleServer { // Optimize RocksDB if (FLAGS_use_photon) { options.IncreaseParallelism(256); + options.allow_concurrent_memtable_write = false; + options.enable_pipelined_write = false; } else { options.IncreaseParallelism(); } options.OptimizeLevelStyleCompaction(); - options.allow_concurrent_memtable_write = false; - options.enable_pipelined_write = false; options.compression = rocksdb::CompressionType::kNoCompression; // create the DB if it's not already present options.create_if_missing = true; From d8f913249ca5b5a95b0e8e0cb488a96eee29b957 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 6 Oct 2024 21:49:59 +0800 Subject: [PATCH 25/29] update perf --- examples/perf/perf-server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/perf/perf-server.cpp b/examples/perf/perf-server.cpp index 995bebb15b9..8af035c4924 100644 --- a/examples/perf/perf-server.cpp +++ b/examples/perf/perf-server.cpp @@ -148,7 +148,7 @@ class ExampleServer { options.allow_concurrent_memtable_write = false; options.enable_pipelined_write = false; } else { - options.IncreaseParallelism(); + options.IncreaseParallelism(8); } options.OptimizeLevelStyleCompaction(); options.compression = rocksdb::CompressionType::kNoCompression; From bb3c7805ca1476d98a2ca251d73370c0a87b8a5d Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 6 Oct 2024 22:09:13 +0800 Subject: [PATCH 26/29] update photon tag --- CMakeLists.txt | 8 +++----- photon.md | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c5be3cf652..9279176d99f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,10 +48,8 @@ include(FetchContent) set(PHOTON_ENABLE_URING OFF CACHE INTERNAL "Enable iouring") FetchContent_Declare( photon - # GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git - GIT_REPOSITORY https://github.com/beef9999/PhotonLibOS.git - # GIT_TAG release/0.8 - GIT_TAG fix-more-Wshadow-warnings + GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git + GIT_TAG release/0.8 ) FetchContent_MakeAvailable(photon) set(PHOTON_INCLUDE_DIR ${photon_SOURCE_DIR}/include/) @@ -195,7 +193,7 @@ if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -W -Wextra -Wall") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wshadow -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wsign-compare -Wno-unused-parameter -Wno-unused-variable -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers -Wno-strict-aliasing") if(MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-format") endif() diff --git a/photon.md b/photon.md index 1c2e2eebece..71db5c4212b 100644 --- a/photon.md +++ b/photon.md @@ -12,7 +12,7 @@ cmake --build build -t perf-client -t perf-server -j `nproc` # Build performance test on RPC client/server (Native RocksDB) git checkout 6.1.2 -git checkout photon-on-6.1.2 -- examples/ CMakeLists.txt +git checkout origin/photon-on-6.1.2 -- examples/ CMakeLists.txt cmake -B build -D INIT_PHOTON_IN_ENV=off -D WITH_LZ4=on -D WITH_SNAPPY=on -D CMAKE_BUILD_TYPE=Release cmake --build build -t perf-client -t perf-server -j `nproc` From f0204f9c1828d49f9cc6a00df7a52f7995caef75 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 20 Oct 2024 20:22:50 +0800 Subject: [PATCH 27/29] photon fetch-content changes to submodule --- .gitmodules | 4 ++++ CMakeLists.txt | 17 ++++------------- third-party/PhotonLibOS | 1 + 3 files changed, 9 insertions(+), 13 deletions(-) create mode 100644 .gitmodules create mode 160000 third-party/PhotonLibOS diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000000..8bafe46addf --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "third-party/PhotonLibOS"] + path = third-party/PhotonLibOS + url = https://github.com/alibaba/PhotonLibOS.git + branch = release/0.8 diff --git a/CMakeLists.txt b/CMakeLists.txt index 9279176d99f..869c7614852 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,23 +44,15 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/modules/") -include(FetchContent) +# Photon set(PHOTON_ENABLE_URING OFF CACHE INTERNAL "Enable iouring") -FetchContent_Declare( - photon - GIT_REPOSITORY https://github.com/alibaba/PhotonLibOS.git - GIT_TAG release/0.8 -) -FetchContent_MakeAvailable(photon) -set(PHOTON_INCLUDE_DIR ${photon_SOURCE_DIR}/include/) - +add_subdirectory(third-party/PhotonLibOS) if (PHOTON_ENABLE_URING) add_compile_definitions("PHOTON_ENABLE_URING") endif () - option(INIT_PHOTON_IN_ENV "INIT PHOTON IN ROCKSDB" OFF) if(INIT_PHOTON_IN_ENV) - add_definitions(-DINIT_PHOTON_IN_ENV) + add_compile_definitions("INIT_PHOTON_IN_ENV") endif() option(WITH_JEMALLOC "build with JeMalloc" OFF) @@ -477,8 +469,7 @@ endif() include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/include) -include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src) -include_directories(${PHOTON_INCLUDE_DIR}) +include_directories(SYSTEM ${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src ${PROJECT_SOURCE_DIR}/third-party/PhotonLibOS/include) find_package(Threads REQUIRED) add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest) diff --git a/third-party/PhotonLibOS b/third-party/PhotonLibOS new file mode 160000 index 00000000000..01cde1ca6b8 --- /dev/null +++ b/third-party/PhotonLibOS @@ -0,0 +1 @@ +Subproject commit 01cde1ca6b860cc440b3f219681fb7d177a67fb4 From f522fc5382dd23139fd96a9f742d9337ace4ebb9 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Sun, 20 Oct 2024 20:28:24 +0800 Subject: [PATCH 28/29] Update photon-auto-convert.sh --- photon-auto-convert.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/photon-auto-convert.sh b/photon-auto-convert.sh index 49a9e525db4..08d4e5ac2c2 100755 --- a/photon-auto-convert.sh +++ b/photon-auto-convert.sh @@ -2,8 +2,8 @@ set -e -cc_files=$(find . -type f -name "*.cc" -not -path "./build/*") -h_files=$(find . -type f -name "*.h" -not -path "./build/*") +cc_files=$(find . -type f -name "*.cc" -not -path "./build/*" -not -path "./third-party/PhotonLibOS/*") +h_files=$(find . -type f -name "*.h" -not -path "./build/*" -not -path "./third-party/PhotonLibOS/*") files="${cc_files} ${h_files}" sed -i 's|#include |#include "port/port.h"|g' $files From 1b651b89e8cdc8331b8dc55f62d5b0cb1b273a57 Mon Sep 17 00:00:00 2001 From: Chen Bo Date: Mon, 21 Oct 2024 12:18:11 +0800 Subject: [PATCH 29/29] update new_stub_pool API --- examples/perf/perf-client.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/perf/perf-client.cpp b/examples/perf/perf-client.cpp index b8945967799..fe0d1ecc983 100644 --- a/examples/perf/perf-client.cpp +++ b/examples/perf/perf-client.cpp @@ -100,7 +100,7 @@ int main(int argc, char** argv) { auto ep = photon::net::EndPoint(photon::net::IPAddr(FLAGS_host.c_str()), FLAGS_port); - auto pool = photon::rpc::new_stub_pool(-1, -1, -1); + auto pool = photon::rpc::new_stub_pool(-1, -1); DEFER(delete pool); if (FLAGS_type == "fill") { @@ -116,4 +116,4 @@ int main(int argc, char** argv) { } photon::thread_sleep(-1); } -} \ No newline at end of file +}