From 68ac66ef2c826a081efd766c33a233f5c6a91bf6 Mon Sep 17 00:00:00 2001 From: Hari Krishna Sunder Date: Fri, 16 Aug 2024 12:22:49 -0700 Subject: [PATCH] [#23492] DocDB: Upgrade and Rollback tests Summary: Adding the framework to perform DB upgrades and rollbacks in unit tests. - `builds.xml` stores the location of the builds for various OS architectures and build types. We currently use `linux_debug_x86`, `linux_release_x86`, `darwin_debug_arm64` and `darwin_release_arm64`. Jenkins covers all linux and darwin release types. Darwin debug is for use on dev mac machines. - Currently supports `2.20.2.4` and `2024.1.0.1` as older versions. New major versions will be added manually as they become available. Minor versions can be added on demand if the need for one arises. - 2.20 linux builds are disabled since `post_install.sh` gets stuck. - Builds are downloaded to the `/opt/yb-build/db-upgrade` directory if they do not already exist. - We do not maintain builds for ASAN and TSAN so these build types are not run. - `UpgradeTestBase` provides the framework to perform all Upgrade and Rollback actions. - `BasicUpgradeTest` tests upgrade and rollback using a simple bank balance workload. - D32492 changed the DocDB format for debug builds so that it is compatible with release builds. This change was not backported, so debug builds on version 2024.2 or older are generated after patching this change.
#23492 Jira: DB-12406 Test Plan: BasicUpgradeTest, TestUpgradeFrom_2_20_2_4 BasicUpgradeTest, TestRollbackTo_2_20_2_4 BasicUpgradeTest, TestUpgradeFrom_2024_1_0_1 BasicUpgradeTest, TestRollbackTo_2024_1_0_1 Reviewers: asrivastava, tfoucher, slingam Reviewed By: asrivastava Subscribers: ybase Differential Revision: https://phorge.dev.yugabyte.com/D37153 --- python/yugabyte/yb_dist_tests.py | 1 + src/yb/integration-tests/CMakeLists.txt | 14 +- src/yb/integration-tests/auto_flags-itest.cc | 16 +- .../cassandra_cpp_driver-test.cc | 8 +- src/yb/integration-tests/external_daemon.h | 3 +- .../external_mini_cluster-itest-base.cc | 87 +++ .../external_mini_cluster-itest-base.h | 66 +-- .../external_mini_cluster.cc | 12 + .../integration-tests/external_mini_cluster.h | 25 +- src/yb/integration-tests/log_version-test.cc | 6 +- .../upgrade-tests/basic_upgrade-test.cc | 262 +++++++++ .../upgrade-tests/builds.xml | 45 ++ .../upgrade-tests/upgrade_test_base.cc | 496 ++++++++++++++++++ .../upgrade-tests/upgrade_test_base.h | 110 ++++ src/yb/util/status_format.h | 8 +- 15 files changed, 1066 insertions(+), 93 deletions(-) create mode 100644 src/yb/integration-tests/external_mini_cluster-itest-base.cc create mode 100644 src/yb/integration-tests/upgrade-tests/basic_upgrade-test.cc create mode 100644 src/yb/integration-tests/upgrade-tests/builds.xml create mode 100644 src/yb/integration-tests/upgrade-tests/upgrade_test_base.cc create mode 100644 src/yb/integration-tests/upgrade-tests/upgrade_test_base.h diff --git a/python/yugabyte/yb_dist_tests.py b/python/yugabyte/yb_dist_tests.py index acc95ccf346e..3f0787d4d178 100644 --- a/python/yugabyte/yb_dist_tests.py +++ b/python/yugabyte/yb_dist_tests.py @@ -192,6 +192,7 @@ def set_global_conf_from_dict(global_conf_dict: Dict[str, str]) -> GlobalTestCon 'linuxbrew_path.txt', 'thirdparty_path.txt', 'thirdparty_url.txt', + 'upgrade_test_builds', f'{POSTGRES_BUILD_SUBDIR}/contrib', f'{POSTGRES_BUILD_SUBDIR}/src/test/regress', 
f'{POSTGRES_BUILD_SUBDIR}/src/test/isolation', diff --git a/src/yb/integration-tests/CMakeLists.txt b/src/yb/integration-tests/CMakeLists.txt index ddf7de355eb5..93db073bc131 100644 --- a/src/yb/integration-tests/CMakeLists.txt +++ b/src/yb/integration-tests/CMakeLists.txt @@ -55,6 +55,14 @@ file(COPY add_custom_target( xcluster_ddl_replication_sql DEPENDS ${CMAKE_BINARY_DIR}/test_xcluster_ddl_replication_sql) +configure_file( + ${CMAKE_SOURCE_DIR}/src/yb/integration-tests/upgrade-tests/builds.xml + ${CMAKE_BINARY_DIR}/upgrade_test_builds/builds.xml + COPYONLY) + +add_custom_target( + upgrade_test_builds_tar DEPENDS ${CMAKE_BINARY_DIR}/upgrade_test_builds) + ADD_YB_TEST_LIBRARY( cdc_test_util SRCS cdc_test_util.cc @@ -68,6 +76,7 @@ set(INTEGRATION_TESTS_SRCS external_daemon.cc external_yb_controller.cc external_mini_cluster_fs_inspector.cc + external_mini_cluster-itest-base.cc external_mini_cluster_validator.cc load_balancer_test_util.cc load_generator.cc @@ -88,6 +97,7 @@ set(INTEGRATION_TESTS_SRCS xcluster/xcluster_ysql_test_base.cc xcluster/xcluster_ycql_test_base.cc xcluster/xcluster_ddl_replication_test_base.cc + upgrade-tests/upgrade_test_base.cc ) ADD_YB_TEST_LIBRARY(integration-tests SRCS ${INTEGRATION_TESTS_SRCS}) @@ -119,7 +129,8 @@ add_dependencies(integration-tests yb-ts-cli generate_test_certs fips_install - xcluster_ddl_replication_sql) + xcluster_ddl_replication_sql + upgrade_test_builds_tar) ADD_YB_TEST_LIBRARY( cql_test_util @@ -249,6 +260,7 @@ ADD_YB_TEST(xcluster/xcluster-tablet-split-itest) ADD_YB_TEST(xcluster/xcluster-test) ADD_YB_TEST(xcluster/xcluster_outbound_replication_group-itest) ADD_YB_TEST(retryable_request-test) +ADD_YB_TEST(upgrade-tests/basic_upgrade-test) set(YB_TEST_LINK_LIBS_SAVED ${YB_TEST_LINK_LIBS}) set(YB_TEST_LINK_LIBS ${YB_TEST_LINK_LIBS} cassandra) diff --git a/src/yb/integration-tests/auto_flags-itest.cc b/src/yb/integration-tests/auto_flags-itest.cc index c1717cba3bd4..8e4555fc5743 100644 --- 
a/src/yb/integration-tests/auto_flags-itest.cc +++ b/src/yb/integration-tests/auto_flags-itest.cc @@ -815,11 +815,6 @@ class AutoFlagsExternalMiniClusterTest : public ExternalMiniClusterITestBase { StartCluster(extra_ts_flags, extra_master_flags, kNumTServers, kNumMasterServers)); } - void SetUpCluster(ExternalMiniClusterOptions* opts) override { - ASSERT_NO_FATALS(ExternalMiniClusterITestBase::SetUpCluster(opts)); - opts_ = *opts; - } - Status CheckFlagOnNode( const string& flag_name, const string& expected_val, ExternalDaemon* daemon) { auto value = VERIFY_RESULT(daemon->GetFlag(flag_name)); @@ -999,9 +994,6 @@ class AutoFlagsExternalMiniClusterTest : public ExternalMiniClusterITestBase { CHECK_EQ(expected_config_version, config.config_version()); return config; } - - protected: - ExternalMiniClusterOptions opts_; }; // Validate AutoFlags in new cluster and make sure it handles process restarts, and addition of @@ -1023,7 +1015,7 @@ TEST_F(AutoFlagsExternalMiniClusterTest, NewCluster) { ASSERT_OK(CheckFlagOnNode(kTESTAutoFlagsNewInstallFlagName, kTrue, new_master.get())); ASSERT_OK(cluster_->AddTabletServer()); - ASSERT_OK(cluster_->WaitForTabletServerCount(opts_.num_tablet_servers + 1, kTimeout)); + ASSERT_OK(cluster_->WaitForTabletServerCount(cluster_->num_tablet_servers(), kTimeout)); ASSERT_OK(CheckFlagOnAllNodes(kTESTAutoFlagsInitializedFlagName, kTrue)); ASSERT_OK(CheckFlagOnAllNodes(kTESTAutoFlagsNewInstallFlagName, kTrue)); @@ -1064,7 +1056,7 @@ TEST_F(AutoFlagsExternalMiniClusterTest, UpgradeCluster) { ASSERT_TRUE(Erase(disable_auto_flag_management, cluster_->mutable_extra_tserver_flags())); ASSERT_OK(cluster_->AddTabletServer()); - ASSERT_OK(cluster_->WaitForTabletServerCount(opts_.num_tablet_servers + 1, kTimeout)); + ASSERT_OK(cluster_->WaitForTabletServerCount(cluster_->num_tablet_servers(), kTimeout)); // Add a new tserver auto* new_tserver = cluster_->tablet_server(cluster_->num_tablet_servers() - 1); @@ -1075,9 +1067,9 @@ 
TEST_F(AutoFlagsExternalMiniClusterTest, UpgradeCluster) { // Restart the new tserver new_tserver->Shutdown(); - ASSERT_OK(cluster_->WaitForTabletServerCount(opts_.num_tablet_servers, kTimeout)); + ASSERT_OK(cluster_->WaitForTabletServerCount(cluster_->num_tablet_servers() - 1, kTimeout)); ASSERT_OK(new_tserver->Restart()); - ASSERT_OK(cluster_->WaitForTabletServerCount(opts_.num_tablet_servers + 1, kTimeout)); + ASSERT_OK(cluster_->WaitForTabletServerCount(cluster_->num_tablet_servers(), kTimeout)); ASSERT_OK(CheckFlagOnNode(kDisableAutoFlagsManagementFlagName, kFalse, new_tserver)); ASSERT_OK(CheckFlagOnNode(kTESTAutoFlagsInitializedFlagName, kFalse, new_tserver)); ASSERT_OK(CheckFlagOnNode(kTESTAutoFlagsNewInstallFlagName, kFalse, new_tserver)); diff --git a/src/yb/integration-tests/cassandra_cpp_driver-test.cc b/src/yb/integration-tests/cassandra_cpp_driver-test.cc index 537e47a66464..d32c243b34ab 100644 --- a/src/yb/integration-tests/cassandra_cpp_driver-test.cc +++ b/src/yb/integration-tests/cassandra_cpp_driver-test.cc @@ -126,11 +126,11 @@ class CppCassandraDriverTest : public ExternalMiniClusterITestBase { ASSERT_OK(admin_client_->Init()); } - void SetUpCluster(ExternalMiniClusterOptions* opts) override { - ASSERT_NO_FATALS(ExternalMiniClusterITestBase::SetUpCluster(opts)); + void SetUpOptions(ExternalMiniClusterOptions& opts) override { + ASSERT_NO_FATALS(ExternalMiniClusterITestBase::SetUpOptions(opts)); - opts->bind_to_unique_loopback_addresses = true; - opts->use_same_ts_ports = true; + opts.bind_to_unique_loopback_addresses = true; + opts.use_same_ts_ports = true; } void TearDown() override { diff --git a/src/yb/integration-tests/external_daemon.h b/src/yb/integration-tests/external_daemon.h index fe803e6bb9de..2a2f6952ec98 100644 --- a/src/yb/integration-tests/external_daemon.h +++ b/src/yb/integration-tests/external_daemon.h @@ -135,6 +135,7 @@ class ExternalDaemon : public RefCountedThreadSafe { std::vector GetDataDirs() const { return data_dirs_; 
} const std::string& exe() const { return exe_; } + void SetExe(const std::string& new_exe) { exe_ = new_exe; } const std::string& GetRootDir() const { return root_dir_; } @@ -275,7 +276,7 @@ class ExternalDaemon : public RefCountedThreadSafe { const std::string daemon_id_; rpc::Messenger* messenger_; rpc::ProxyCache* proxy_cache_; - const std::string exe_; + std::string exe_; const std::string root_dir_; std::vector data_dirs_; std::vector extra_flags_; diff --git a/src/yb/integration-tests/external_mini_cluster-itest-base.cc b/src/yb/integration-tests/external_mini_cluster-itest-base.cc new file mode 100644 index 000000000000..07a8e08fab60 --- /dev/null +++ b/src/yb/integration-tests/external_mini_cluster-itest-base.cc @@ -0,0 +1,87 @@ +// Copyright (c) YugabyteDB, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. +// + +#include "yb/integration-tests/external_mini_cluster-itest-base.h" + +namespace yb { + +void ExternalMiniClusterITestBase::SetUpOptions(ExternalMiniClusterOptions& opts) { + // Fsync causes flakiness on EC2. 
+ opts.extra_tserver_flags.push_back("--never_fsync"); +} + +void ExternalMiniClusterITestBase::StartCluster( + const std::vector& extra_ts_flags, + const std::vector& extra_master_flags, int num_tablet_servers, int num_masters, + bool enable_ysql) { + ExternalMiniClusterOptions opts; + opts.num_masters = num_masters; + opts.num_tablet_servers = num_tablet_servers; + opts.extra_master_flags = extra_master_flags; + opts.extra_tserver_flags = extra_ts_flags; + opts.enable_ysql = enable_ysql; + + ASSERT_OK(StartCluster(opts)); +} + +Status ExternalMiniClusterITestBase::StartCluster(ExternalMiniClusterOptions opts) { + SetUpOptions(opts); + + cluster_.reset(new ExternalMiniCluster(opts)); + RETURN_NOT_OK(cluster_->Start()); + inspect_.reset(new itest::ExternalMiniClusterFsInspector(cluster_.get())); + ts_map_ = VERIFY_RESULT(itest::CreateTabletServerMap(cluster_.get())); + client_ = VERIFY_RESULT(cluster_->CreateClient()); + + return Status::OK(); +} + +void ExternalMiniClusterITestBase::TearDown() { + client_.reset(); + if (cluster_) { + if (HasFatalFailure()) { + LOG(INFO) << "Found fatal failure"; + for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) { + if (!cluster_->tablet_server(i)->IsProcessAlive()) { + LOG(INFO) << "Tablet server " << i << " is not running. 
Cannot dump its stacks."; + continue; + } + LOG(INFO) << "Attempting to dump stacks of TS " << i << " with UUID " + << cluster_->tablet_server(i)->uuid() << " and pid " + << cluster_->tablet_server(i)->pid(); + WARN_NOT_OK( + PstackWatcher::DumpPidStacks(cluster_->tablet_server(i)->pid()), + "Couldn't dump stacks"); + } + } + cluster_->Shutdown(); + } + YBTest::TearDown(); + ts_map_.clear(); +} + +Result ExternalMiniClusterITestBase::GetSingleTabletId(const TableName& table_name) { + TabletId tablet_id_to_split; + for (size_t i = 0; i < cluster_->num_tablet_servers(); ++i) { + const auto ts = cluster_->tablet_server(i); + const auto tablets = VERIFY_RESULT(cluster_->GetTablets(ts)); + for (const auto& tablet : tablets) { + if (tablet.table_name() == table_name) { + return tablet.tablet_id(); + } + } + } + return STATUS(NotFound, Format("No tablet found for table $0.", table_name)); +} + +} // namespace yb diff --git a/src/yb/integration-tests/external_mini_cluster-itest-base.h b/src/yb/integration-tests/external_mini_cluster-itest-base.h index 377e96beac4f..8d834f91af61 100644 --- a/src/yb/integration-tests/external_mini_cluster-itest-base.h +++ b/src/yb/integration-tests/external_mini_cluster-itest-base.h @@ -57,79 +57,25 @@ namespace yb { // setup routines useful for integration tests. class ExternalMiniClusterITestBase : public YBTest { public: - virtual void SetUpCluster(ExternalMiniClusterOptions* opts) { - // Fsync causes flakiness on EC2. - CHECK_NOTNULL(opts)->extra_tserver_flags.push_back("--never_fsync"); - } + virtual void TearDown() override; - virtual void TearDown() override { - client_.reset(); - if (cluster_) { - if (HasFatalFailure()) { - LOG(INFO) << "Found fatal failure"; - for (size_t i = 0; i < cluster_->num_tablet_servers(); i++) { - if (!cluster_->tablet_server(i)->IsProcessAlive()) { - LOG(INFO) << "Tablet server " << i << " is not running. 
Cannot dump its stacks."; - continue; - } - LOG(INFO) << "Attempting to dump stacks of TS " << i - << " with UUID " << cluster_->tablet_server(i)->uuid() - << " and pid " << cluster_->tablet_server(i)->pid(); - WARN_NOT_OK(PstackWatcher::DumpPidStacks(cluster_->tablet_server(i)->pid()), - "Couldn't dump stacks"); - } - } - cluster_->Shutdown(); - } - YBTest::TearDown(); - ts_map_.clear(); - } - - Result GetSingleTabletId(const TableName& table_name) { - TabletId tablet_id_to_split; - for (size_t i = 0; i < cluster_->num_tablet_servers(); ++i) { - const auto ts = cluster_->tablet_server(i); - const auto tablets = VERIFY_RESULT(cluster_->GetTablets(ts)); - for (const auto& tablet : tablets) { - if (tablet.table_name() == table_name) { - return tablet.tablet_id(); - } - } - } - return STATUS(NotFound, Format("No tablet found for table $0.", table_name)); - } + Result GetSingleTabletId(const TableName& table_name); protected: + virtual void SetUpOptions(ExternalMiniClusterOptions& opts); + void StartCluster(const std::vector& extra_ts_flags = std::vector(), const std::vector& extra_master_flags = std::vector(), int num_tablet_servers = 3, int num_masters = 1, bool enable_ysql = false); + Status StartCluster(ExternalMiniClusterOptions opts); + std::unique_ptr cluster_; std::unique_ptr inspect_; std::unique_ptr client_; itest::TabletServerMap ts_map_; }; -void ExternalMiniClusterITestBase::StartCluster(const std::vector& extra_ts_flags, - const std::vector& extra_master_flags, - int num_tablet_servers, - int num_masters, - bool enable_ysql) { - ExternalMiniClusterOptions opts; - opts.num_masters = num_masters; - opts.num_tablet_servers = num_tablet_servers; - opts.extra_master_flags = extra_master_flags; - opts.extra_tserver_flags = extra_ts_flags; - opts.enable_ysql = enable_ysql; - SetUpCluster(&opts); - - cluster_.reset(new ExternalMiniCluster(opts)); - ASSERT_OK(cluster_->Start()); - inspect_.reset(new itest::ExternalMiniClusterFsInspector(cluster_.get())); - ts_map_ 
= ASSERT_RESULT(itest::CreateTabletServerMap(cluster_.get())); - client_ = ASSERT_RESULT(cluster_->CreateClient()); -} - } // namespace yb diff --git a/src/yb/integration-tests/external_mini_cluster.cc b/src/yb/integration-tests/external_mini_cluster.cc index 5b98a7d18204..8daa78439e5a 100644 --- a/src/yb/integration-tests/external_mini_cluster.cc +++ b/src/yb/integration-tests/external_mini_cluster.cc @@ -310,6 +310,10 @@ Status ExternalMiniCluster::DeduceBinRoot(std::string* ret) { return Status::OK(); } +void ExternalMiniCluster::SetDaemonBinPath(const std::string& bin_path) { + daemon_bin_path_ = bin_path; +} + std::string ExternalMiniCluster::GetClusterDataDirName() const { if (opts_.cluster_id == "") { return "minicluster-data"; @@ -483,6 +487,14 @@ string ExternalMiniCluster::GetDataPath(const string& daemon_id) const { return JoinPathSegments(data_root_, daemon_id); } +std::string ExternalMiniCluster::GetMasterBinaryPath() const { + return GetBinaryPath(GetMasterBinaryName()); +} + +std::string ExternalMiniCluster::GetTServerBinaryPath() const { + return GetBinaryPath(GetTServerBinaryName()); +} + namespace { vector SubstituteInFlags(const vector& orig_flags, size_t index) { string str_index = std::to_string(index); diff --git a/src/yb/integration-tests/external_mini_cluster.h b/src/yb/integration-tests/external_mini_cluster.h index 3ab96b1769f9..15c1c0d029b5 100644 --- a/src/yb/integration-tests/external_mini_cluster.h +++ b/src/yb/integration-tests/external_mini_cluster.h @@ -567,6 +567,7 @@ class ExternalMiniCluster : public MiniClusterBase { MonoDelta timeout = MonoDelta::kMin); protected: + friend class UpgradeTestBase; FRIEND_TEST(MasterFailoverTest, TestKillAnyMaster); void ConfigureClientBuilder(client::YBClientBuilder* builder) override; @@ -578,7 +579,6 @@ class ExternalMiniCluster : public MiniClusterBase { std::string GetBinaryPath(const std::string& binary) const; std::string GetDataPath(const std::string& daemon_id) const; - Status 
DeduceBinRoot(std::string* ret); Status HandleOptions(); std::string GetClusterDataDirName() const; @@ -590,8 +590,8 @@ class ExternalMiniCluster : public MiniClusterBase { Status AddMaster(const ExternalMasterPtr& master); Status RemoveMaster(const ExternalMasterPtr& master); - // Get the index of this master in the vector of masters. This might not be the insertion order as - // we might have removed some masters within the vector. + // Get the index of this master in the vector of masters. This might not be the insertion order + // as we might have removed some masters within the vector. int GetIndexOfMaster(const ExternalMaster* master) const; // Checks that the masters_ list and opts_ match in terms of the number of elements. @@ -599,14 +599,18 @@ class ExternalMiniCluster : public MiniClusterBase { // Return the list of opid's for all master's in this cluster. Status GetLastOpIdForMasterPeers( - const MonoDelta& timeout, - consensus::OpIdType opid_type, - std::vector* op_ids, + const MonoDelta& timeout, consensus::OpIdType opid_type, std::vector* op_ids, const std::vector& masters); // Return master address for specified port. std::string MasterAddressForPort(uint16_t port) const; + Status DeduceBinRoot(std::string* ret); + std::string GetDaemonBinPath() const { return daemon_bin_path_; } + void SetDaemonBinPath(const std::string& bin_path); + std::string GetMasterBinaryPath() const; + std::string GetTServerBinaryPath() const; + ExternalMiniClusterOptions opts_; // The root for binaries. @@ -614,12 +618,13 @@ class ExternalMiniCluster : public MiniClusterBase { std::string data_root_; - // This variable is incremented every time a new master is spawned (either in shell mode or create - // mode). Avoids reusing an index of a killed/removed master. Useful for master side logging. + // This variable is incremented every time a new master is spawned (either in shell mode or + // create mode). Avoids reusing an index of a killed/removed master. 
Useful for master side + // logging. size_t add_new_master_at_ = 0; - std::vector > masters_; - std::vector > tablet_servers_; + std::vector> masters_; + std::vector> tablet_servers_; std::vector> yb_controller_servers_; diff --git a/src/yb/integration-tests/log_version-test.cc b/src/yb/integration-tests/log_version-test.cc index f965fd78c26b..0a510eff5d92 100644 --- a/src/yb/integration-tests/log_version-test.cc +++ b/src/yb/integration-tests/log_version-test.cc @@ -69,9 +69,9 @@ class LogHeader { class LogRollingTest : public ExternalMiniClusterITestBase { public: - void SetUpCluster(ExternalMiniClusterOptions* opts) override { - ExternalMiniClusterITestBase::SetUpCluster(opts); - opts->log_to_file = true; + void SetUpOptions(ExternalMiniClusterOptions& opts) override { + ExternalMiniClusterITestBase::SetUpOptions(opts); + opts.log_to_file = true; } }; diff --git a/src/yb/integration-tests/upgrade-tests/basic_upgrade-test.cc b/src/yb/integration-tests/upgrade-tests/basic_upgrade-test.cc new file mode 100644 index 000000000000..96d0788cb5f5 --- /dev/null +++ b/src/yb/integration-tests/upgrade-tests/basic_upgrade-test.cc @@ -0,0 +1,262 @@ +// Copyright (c) YugabyteDB, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. +// + +#include "yb/integration-tests/upgrade-tests/upgrade_test_base.h" + +#include "yb/yql/pgwrapper/libpq_utils.h" + +namespace yb { + +// Test upgrade and rollback with a simple workload with updates and selects. 
+class BasicUpgradeTest : public UpgradeTestBase { + public: + explicit BasicUpgradeTest(const std::string& from_version) : UpgradeTestBase(from_version) {} + virtual ~BasicUpgradeTest() = default; + + Status VerifyVersionFromDB(const std::string& expected_version) { + auto conn = VERIFY_RESULT(cluster_->ConnectToDB()); + auto version = VERIFY_RESULT(conn.FetchRowAsString("SELECT version()")); + LOG(INFO) << "Running version: " << version; + + SCHECK_STR_CONTAINS(version, expected_version); + return Status::OK(); + } + + Status TestUpgrade() { + RETURN_NOT_OK(StartClusterInOldVersion()); + RETURN_NOT_OK(VerifyVersionFromDB(old_version_info().version)); + + RETURN_NOT_OK(PrepareTableAndStartWorkload()); + + allow_errors_ = true; + RETURN_NOT_OK(UpgradeClusterToCurrentVersion()); + allow_errors_ = false; + + RETURN_NOT_OK(VerifyVersionFromDB(current_version_info().version_number())); + RETURN_NOT_OK(StopWorkloadAndCheckResults()); + + return Status::OK(); + } + + Status TestRollback() { + RETURN_NOT_OK(StartClusterInOldVersion()); + RETURN_NOT_OK(VerifyVersionFromDB(old_version_info().version)); + + RETURN_NOT_OK(PrepareTableAndStartWorkload()); + + const auto delay_between_nodes = 3s; + + allow_errors_ = true; + RETURN_NOT_OK(UpgradeClusterToCurrentVersion(delay_between_nodes, /*auto_finalize=*/false)); + + RETURN_NOT_OK(VerifyVersionFromDB(current_version_info().version_number())); + + RETURN_NOT_OK(RollbackClusterToOldVersion(delay_between_nodes)); + allow_errors_ = false; + + RETURN_NOT_OK(VerifyVersionFromDB(old_version_info().version)); + RETURN_NOT_OK(StopWorkloadAndCheckResults()); + + return Status::OK(); + } + + static constexpr auto kAccountBalanceTable = "account_balance"; + static const int kNumUsers = 3; + Status PrepareTableAndStartWorkload() { + auto conn = VERIFY_RESULT(cluster_->ConnectToDB()); + RETURN_NOT_OK(conn.ExecuteFormat( + "CREATE TABLE $0(id INT, name TEXT, salary INT, PRIMARY KEY(id));", kAccountBalanceTable)); + + for (int i = 0; i < 
kNumUsers; i++) { + RETURN_NOT_OK(conn.ExecuteFormat( + "INSERT INTO $0 VALUES($1, 'user$1', 1000000)", kAccountBalanceTable, i)); + } + + LOG(INFO) << "Initial data inserted: "; + RETURN_NOT_OK(PrintAccountBalanceTable()); + + test_thread_holder_.AddThread([this]() { + update_status_ = RunUpdateAccountBalanceWorkload(keep_running_, allow_errors_); + }); + test_thread_holder_.AddThread( + [this]() { scan_status_ = RunScanAccountBalanceWorkload(keep_running_, allow_errors_); }); + + // Wait for a few runs. + SleepFor(3s); + + return Status::OK(); + } + + Status StopWorkloadAndCheckResults() { + // Wait for a few extra runs. + SleepFor(10s); + + keep_running_ = false; + test_thread_holder_.JoinAll(); + RETURN_NOT_OK_PREPEND(update_status_, "Failed the update workload"); + RETURN_NOT_OK_PREPEND(scan_status_, "Failed the scan workload"); + return Status::OK(); + } + + Status PrintAccountBalanceTable() { + const auto select_all_query = Format("SELECT * FROM $0", kAccountBalanceTable); + + auto conn = VERIFY_RESULT(cluster_->ConnectToDB()); + auto rows = VERIFY_RESULT((conn.FetchRows(select_all_query))); + std::stringstream result_string; + result_string << "Account balance table: "; + for (const auto& row : rows) { + result_string << std::endl + << std::get<0>(row) << ", " << std::get<1>(row) << ", " << std::get<2>(row); + } + LOG(INFO) << result_string.str(); + + return Status::OK(); + } + + // Setup the connection if needed. Return true if a valid connection is ready. + // If we failed to create a connection and allowed_errors is set then returns false, else returns + // bad Status. 
+ Result TrySetupConn(std::unique_ptr& conn, bool allow_errors) { + if (conn) { + return true; + } + + auto try_create_conn = [&]() -> Result> { + return std::make_unique(VERIFY_RESULT(cluster_->ConnectToDB())); + }; + + auto conn_result = try_create_conn(); + if (conn_result.ok()) { + conn.swap(*conn_result); + return true; + } + + if (allow_errors) { + LOG(ERROR) << "Failed to create new connection: " << conn_result.status(); + + return false; + } + + return conn_result.status(); + } + + // Move 500 from each user except the last one to the last user. + Status RunUpdateAccountBalanceWorkload( + std::atomic& keep_running, std::atomic& allow_errors) { + std::ostringstream oss; + oss << "BEGIN TRANSACTION;"; + for (int i = 0; i < kNumUsers - 1; i++) { + oss << "UPDATE account_balance SET salary = salary - 500 WHERE name = 'user" << i << "';"; + } + oss << "UPDATE account_balance SET salary = salary + " << 500 * (kNumUsers - 1) + << " WHERE name = 'user" << kNumUsers - 1 << "';"; + oss << "COMMIT;"; + auto update_query = oss.str(); + + std::unique_ptr conn; + LOG(INFO) << "Running update workload in a loop: " << update_query; + + while (keep_running) { + SleepFor(100ms); + + if (!VERIFY_RESULT(TrySetupConn(conn, allow_errors))) { + continue; + } + + auto status = conn->Execute(update_query); + if (!status.ok()) { + LOG(WARNING) << "Failed to update: " << status; + if (!allow_errors) { + return status; + } + conn.reset(); + continue; + } + } + + return Status::OK(); + } + + // Make sure the total balance remains unchanged. 
+ Status RunScanAccountBalanceWorkload( + std::atomic& keep_running, std::atomic& allow_errors) { + const auto select_salary_sum_query = Format("SELECT SUM(salary) FROM $0", kAccountBalanceTable); + int64_t total_salary = 0; + { + auto conn = VERIFY_RESULT(cluster_->ConnectToDB()); + total_salary = VERIFY_RESULT(conn.FetchRow(select_salary_sum_query)); + } + + LOG(INFO) << "Running consumer workload in a loop: " << select_salary_sum_query; + std::unique_ptr conn; + while (keep_running) { + SleepFor(100ms); + + if (!VERIFY_RESULT(TrySetupConn(conn, allow_errors))) { + continue; + } + + auto salary_result = conn->FetchRow(select_salary_sum_query); + if (!salary_result.ok()) { + LOG(WARNING) << "Failed to fetch salary sum: " << salary_result.status(); + if (!allow_errors) { + return salary_result.status(); + } + conn.reset(); + continue; + } + + if (total_salary != *salary_result) { + LOG(ERROR) << "Invalid data: "; + WARN_NOT_OK(PrintAccountBalanceTable(), "Failed to print account balance table"); + return STATUS_FORMAT( + IllegalState, "Expected total $0, received total $1", total_salary, *salary_result); + } + } + + return Status::OK(); + } + + private: + Status update_status_, scan_status_; + std::atomic keep_running_{true}, allow_errors_{false}; +}; + +class TestBasicUpgradeFrom_2_20_2_4 : public BasicUpgradeTest { + public: + TestBasicUpgradeFrom_2_20_2_4() : BasicUpgradeTest(kBuild_2_20_2_4) {} +}; + +TEST_F_EX(BasicUpgradeTest, TestUpgradeFrom_2_20_2_4, TestBasicUpgradeFrom_2_20_2_4) { + ASSERT_OK(TestUpgrade()); +} + +TEST_F_EX(BasicUpgradeTest, TestRollbackTo_2_20_2_4, TestBasicUpgradeFrom_2_20_2_4) { + ASSERT_OK(TestRollback()); +} + +class TestBasicUpgradeFrom_2024_1_0_1 : public BasicUpgradeTest { + public: + TestBasicUpgradeFrom_2024_1_0_1() : BasicUpgradeTest(kBuild_2024_1_0_1) {} +}; + +TEST_F_EX(BasicUpgradeTest, TestUpgradeFrom_2024_1_0_1, TestBasicUpgradeFrom_2024_1_0_1) { + ASSERT_OK(TestUpgrade()); +} + +TEST_F_EX(BasicUpgradeTest, 
TestRollbackTo_2024_1_0_1, TestBasicUpgradeFrom_2024_1_0_1) { + ASSERT_OK(TestRollback()); +} + +} // namespace yb diff --git a/src/yb/integration-tests/upgrade-tests/builds.xml b/src/yb/integration-tests/upgrade-tests/builds.xml new file mode 100644 index 000000000000..85e0114ee3a6 --- /dev/null +++ b/src/yb/integration-tests/upgrade-tests/builds.xml @@ -0,0 +1,45 @@ + + + + + + b1 + + + + + + https://s3.us-west-2.amazonaws.com/uploads.dev.yugabyte.com/local-provider-test/2.20.2.4/yugabyte-2.20.2.4-1bb0a188802ed9d0b5c5b45b0cc58a50bdaeef84-debug-clang-darwin-arm64.tar.gz + + + https://s3.us-west-2.amazonaws.com/uploads.dev.yugabyte.com/local-provider-test/2.20.2.4/"yugabyte-2.20.2.4-5e022cea206152dbec17575f15d8fffc47fa2646-release-clang-darwin-arm64.tar.gz + + + + b1 + + https://s3.us-west-2.amazonaws.com/uploads.dev.yugabyte.com/local-provider-test/2024.1.0.1/yugabyte-2024.1.0.1-1b2019b3f075670a97989ceaf73ac45438b8948b-debug-clang17-centos-x86_64.tar.gz + + + https://s3.us-west-2.amazonaws.com/uploads.dev.yugabyte.com/local-provider-test/2024.1.0.1/yugabyte-2024.1.0.1-2079437eba766ae7c9c3fc390031c6bae9d6d99e-release-clang17-centos-x86_64.tar.gz + + + https://s3.us-west-2.amazonaws.com/uploads.dev.yugabyte.com/local-provider-test/2024.1.0.1/yugabyte-2024.1.0.1-cee7ae8183c225af8e34d906ecb50a403d4237a2-debug-clang-darwin-arm64.tar.gz + + + https://s3.us-west-2.amazonaws.com/uploads.dev.yugabyte.com/local-provider-test/2024.1.0.1/yugabyte-2024.1.0.1-2079437eba766ae7c9c3fc390031c6bae9d6d99e-release-clang-darwin-arm64.tar.gz + + + diff --git a/src/yb/integration-tests/upgrade-tests/upgrade_test_base.cc b/src/yb/integration-tests/upgrade-tests/upgrade_test_base.cc new file mode 100644 index 000000000000..6db6a07fe27a --- /dev/null +++ b/src/yb/integration-tests/upgrade-tests/upgrade_test_base.cc @@ -0,0 +1,496 @@ +// Copyright (c) YugabyteDB, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. +// + +#include "yb/integration-tests/upgrade-tests/upgrade_test_base.h" + +#include +#include +#include + +#include + +#include "yb/util/debug.h" +#include "yb/util/env_util.h" +#include "yb/util/scope_exit.h" +#include "yb/util/version_info.h" +#include "yb/yql/pgwrapper/libpq_utils.h" + +DECLARE_uint32(auto_flags_apply_delay_ms); + +namespace yb { + +namespace { + +const MonoDelta kTimeout = 20s * kTimeMultiplier; + +// Returns the URL for the current build type and os platform. Returns empty string if a valid URL +// does not exist. +std::string GetRelevantUrl(const BuildInfo& info) { +#if defined(__APPLE__) && defined(__aarch64__) + return kIsDebug ? info.darwin_debug_arm64_url : info.darwin_release_arm64_url; +#elif defined(__linux__) && defined(__x86_64__) + return kIsDebug ? info.linux_debug_x86_url : info.linux_release_x86_url; +#endif + return ""; +} + +Status RunCommand(const std::vector& args) { + std::string output, error; + LOG(INFO) << "Execute: " << AsString(args); + auto status = Subprocess::Call(args, &output, &error); + if (!status.ok()) { + return status.CloneAndAppend(error).CloneAndPrepend( + Format("Error running command $0: ", args.front())); + } + LOG(INFO) << "Command Output: " << output; + return Status::OK(); +} + +// Get the value of the key from the xml node as a string, and trims the value.
+template <typename T> +std::string GetXmlPathAsString(const T& node, const std::string& key) { + auto value = node.template get<std::string>(key); + boost::trim(value); + return value; +} + +// Gets the build info for the given version from the builds.xml file. +Result<BuildInfo> GetBuildInfoForVersion(const std::string& version) { + const auto sub_dir = "upgrade_test_builds"; + const auto build_file_xml = + JoinPathSegments(env_util::GetRootDir(sub_dir), sub_dir, "builds.xml"); + + LOG(INFO) << "Reading build info from " << build_file_xml; + + try { + boost::property_tree::ptree pt; + boost::property_tree::xml_parser::read_xml(build_file_xml, pt); + for (const auto& [_, node] : pt.get_child("builds")) { + if (GetXmlPathAsString(node, "<xmlattr>.version") == version) { + BuildInfo build_info; + build_info.version = version; + build_info.build_number = GetXmlPathAsString(node, "build_number"); + build_info.linux_debug_x86_url = GetXmlPathAsString(node, "linux_debug_x86"); + build_info.linux_release_x86_url = GetXmlPathAsString(node, "linux_release_x86"); + build_info.darwin_debug_arm64_url = GetXmlPathAsString(node, "darwin_debug_arm64"); + build_info.darwin_release_arm64_url = GetXmlPathAsString(node, "darwin_release_arm64"); + return build_info; + } + } + } catch (const std::exception& e) { + return STATUS_FORMAT(NotFound, "Failed to parse build file $0: $1", build_file_xml, e.what()); + } + + return STATUS_FORMAT( + NotFound, "Build info for version $0 not found in $1", version, build_file_xml); +} + +// Download and extract the old version if it does not already exist, and return the old version bin +// path. A ready.txt file is placed in the version root directory to indicate that the old version +// is ready for use. +Result<std::string> DownloadAndGetBinPath(const BuildInfo& build_info) { + std::string arch = "linux"; + std::string tar_bin = "tar"; +#ifdef __APPLE__ + arch = "darwin"; + tar_bin = "gtar"; +#endif + arch += kIsDebug ? 
"_debug" : "_release"; + + auto env = Env::Default(); + const std::string build_root = + JoinPathSegments(DirName(env_util::GetRootDir("bin")), "db-upgrade"); + RETURN_NOT_OK(env_util::CreateDirIfMissing(env, build_root)); + const auto version_root_path = JoinPathSegments( + build_root, Format("yugabyte_$0-$1_$2", build_info.version, build_info.build_number, arch)); + RETURN_NOT_OK(env_util::CreateDirIfMissing(env, version_root_path)); + + // Get a lock on a file since multiple tests can be running in parallel and downloading the same + // build to the same location. + const auto lock_file = JoinPathSegments(version_root_path, "lock.lck"); + FileLock* f_lock = nullptr; + MonoTime start = MonoTime::Now(); + do { + auto s = env->LockFile(lock_file, &f_lock, /*recursive_lock_ok=*/false); + if (s.ok()) { + break; + } + + SCHECK_LT( + MonoTime::Now() - start, 5min, IllegalState, + Format("Failed to acquire lock on file $0", lock_file)); + SleepFor(100ms); + } while (true); + auto se = ScopeExit([f_lock, &env] { CHECK_OK(env->UnlockFile(f_lock)); }); + + const auto ready_file = JoinPathSegments(version_root_path, "ready.txt"); + const auto extract_path = + JoinPathSegments(version_root_path, Format("yugabyte-$0", build_info.version)); + const auto bin_path = JoinPathSegments(extract_path, "bin"); + if (env->FileExists(ready_file)) { + LOG(INFO) << bin_path << " already downloaded and ready for use"; + return bin_path; + } + + const auto download_url = GetRelevantUrl(build_info); + const auto tar_file_name = BaseName(download_url); + + const std::string kDownloadDir = "/opt/yb-build/db-upgrade"; + const auto tar_file_path = JoinPathSegments(kDownloadDir, tar_file_name); + + if (!env->FileExists(tar_file_path)) { + RETURN_NOT_OK(env_util::CreateDirIfMissing(env, kDownloadDir)); + LOG(INFO) << "Downloading " << download_url << " to " << tar_file_path; + RETURN_NOT_OK(RunCommand( + {"curl", "--fail", "--retry", "3", "--retry-delay", "3", download_url, "-o", tar_file_path})); + } 
+ + LOG(INFO) << "Extracting " << tar_file_path << " to " << version_root_path; + if (env->DirExists(extract_path)) { + RETURN_NOT_OK(env->DeleteRecursively(extract_path)); + } + RETURN_NOT_OK(env->CreateDir(extract_path)); + RETURN_NOT_OK( + RunCommand({tar_bin, "xzf", tar_file_path, "--skip-old-files", "-C", version_root_path})); + +#if defined(__linux__) + RETURN_NOT_OK(RunCommand({"bash", JoinPathSegments(bin_path, "post_install.sh")})); +#endif + + RETURN_NOT_OK(WriteStringToFileSync(env, MonoTime::Now().ToFormattedString(), ready_file)); + + return bin_path; +} + +template <typename T> +Status RestartDaemonInVersion(T& daemon, const std::string& bin_path) { + daemon.Shutdown(); + daemon.SetExe(bin_path); + return daemon.Restart(); +} + +// Add the flag_name to undefok list, so that it can be set on all versions even if the version does +// not contain the flag. If the flag_list already contains an undefok flag, append to it, else +// insert a new entry. +void AddUnDefOkFlag(std::vector<std::string>& flag_list, const std::string& flag_name) { + for (auto& flag : flag_list) { + if (flag.rfind("--undefok=", 0) == 0) { + flag += Format(",$0", flag_name); + return; + } + } + flag_list.push_back(Format("--undefok=$0", flag_name)); +} + +void WaitForAutoFlagApply() { SleepFor(FLAGS_auto_flags_apply_delay_ms * 1ms + 3s); } + +} // namespace + +UpgradeTestBase::UpgradeTestBase(const std::string& from_version) + : old_version_info_(CHECK_RESULT(GetBuildInfoForVersion(from_version))) { + LOG(INFO) << "Old version: " << old_version_info_.version << ": " + << GetRelevantUrl(old_version_info_); +} + +void UpgradeTestBase::SetUp() { + if (IsSanitizer()) { + GTEST_SKIP() << "Upgrade testing not supported with sanitizers"; + } + + if (old_version_info_.version.empty()) { + CHECK(false) << "Build info for old version not set"; + return; + } + + if (GetRelevantUrl(old_version_info_).empty()) { + GTEST_SKIP() << "Upgrade testing not supported from version " << old_version_info_.version + << " for this OS 
architecture and build type"; + } + + ExternalMiniClusterITestBase::SetUp(); + + VersionInfo::GetVersionInfoPB(&current_version_info_); + LOG(INFO) << "Current version: " << current_version_info_.DebugString(); + ASSERT_NE(old_version_info_.version, current_version_info_.version_number()); +} + +Status UpgradeTestBase::StartClusterInOldVersion() { + ExternalMiniClusterOptions default_opts; + default_opts.num_masters = 3; + default_opts.num_tablet_servers = 3; + + return StartClusterInOldVersion(default_opts); +} + +Status UpgradeTestBase::StartClusterInOldVersion(const ExternalMiniClusterOptions& options) { + ExternalMiniClusterOptions opts = options; + opts.enable_ysql = true; + opts.daemon_bin_path = VERIFY_RESULT(DownloadAndGetBinPath(old_version_info_)); + + // Disable TEST_always_return_consensus_info_for_succeeded_rpc since it is not upgrade safe. + AddUnDefOkFlag(opts.extra_master_flags, "TEST_always_return_consensus_info_for_succeeded_rpc"); + opts.extra_master_flags.push_back("--TEST_always_return_consensus_info_for_succeeded_rpc=false"); + AddUnDefOkFlag(opts.extra_tserver_flags, "TEST_always_return_consensus_info_for_succeeded_rpc"); + opts.extra_tserver_flags.push_back("--TEST_always_return_consensus_info_for_succeeded_rpc=false"); + + LOG(INFO) << "Starting cluster in version: " << old_version_info_.version; + + RETURN_NOT_OK(ExternalMiniClusterITestBase::StartCluster(opts)); + + old_version_bin_path_ = cluster_->GetDaemonBinPath(); + old_version_master_bin_path_ = cluster_->GetMasterBinaryPath(); + old_version_tserver_bin_path_ = cluster_->GetTServerBinaryPath(); + + RETURN_NOT_OK(cluster_->DeduceBinRoot(&current_version_bin_path_)); + cluster_->SetDaemonBinPath(current_version_bin_path_); + current_version_master_bin_path_ = cluster_->GetMasterBinaryPath(); + current_version_tserver_bin_path_ = cluster_->GetTServerBinaryPath(); + cluster_->SetDaemonBinPath(old_version_bin_path_); + + return Status::OK(); +} + +Status 
UpgradeTestBase::UpgradeClusterToCurrentVersion( + MonoDelta delay_between_nodes, bool auto_finalize) { + LOG(INFO) << "Upgrading cluster to current version"; + + RETURN_NOT_OK_PREPEND( + RestartAllMastersInCurrentVersion(delay_between_nodes), "Failed to restart masters"); + + RETURN_NOT_OK_PREPEND( + RestartAllTServersInCurrentVersion(delay_between_nodes), "Failed to restart tservers"); + + RETURN_NOT_OK_PREPEND( + PromoteAutoFlags(AutoFlagClass::kLocalVolatile), "Failed to promote volatile AutoFlags"); + + if (!auto_finalize) { + return Status::OK(); + } + + return FinalizeUpgrade(); +} + +Status UpgradeTestBase::RestartAllMastersInCurrentVersion(MonoDelta delay_between_nodes) { + LOG(INFO) << "Restarting all yb-masters in current version"; + + for (auto* master : cluster_->master_daemons()) { + RETURN_NOT_OK(RestartMasterInCurrentVersion(*master, /*wait_for_cluster_to_stabilize=*/false)); + SleepFor(delay_between_nodes); + } + + RETURN_NOT_OK(WaitForClusterToStabilize()); + + return Status::OK(); +} + +Status UpgradeTestBase::RestartMasterInCurrentVersion( + ExternalMaster& master, bool wait_for_cluster_to_stabilize) { + LOG(INFO) << "Restarting yb-master " << master.id() << " in current version"; + RETURN_NOT_OK(RestartDaemonInVersion(master, current_version_master_bin_path_)); + + if (wait_for_cluster_to_stabilize) { + RETURN_NOT_OK(WaitForClusterToStabilize()); + } + + return Status::OK(); +} + +Status UpgradeTestBase::RestartAllTServersInCurrentVersion(MonoDelta delay_between_nodes) { + LOG(INFO) << "Restarting all yb-tservers in current version"; + + for (auto* tserver : cluster_->tserver_daemons()) { + RETURN_NOT_OK( + RestartTServerInCurrentVersion(*tserver, /*wait_for_cluster_to_stabilize=*/false)); + SleepFor(delay_between_nodes); + } + + RETURN_NOT_OK(WaitForClusterToStabilize()); + + return Status::OK(); +} + +Status UpgradeTestBase::RestartTServerInCurrentVersion( + ExternalTabletServer& ts, bool wait_for_cluster_to_stabilize) { + LOG(INFO) << 
"Restarting yb-tserver " << ts.id() << " in current version"; + RETURN_NOT_OK(RestartDaemonInVersion(ts, current_version_tserver_bin_path_)); + + if (wait_for_cluster_to_stabilize) { + RETURN_NOT_OK(WaitForClusterToStabilize()); + } + + return Status::OK(); +} + +Status UpgradeTestBase::PromoteAutoFlags(AutoFlagClass flag_class) { + LOG(INFO) << "Promoting AutoFlags " << flag_class; + + master::PromoteAutoFlagsRequestPB req; + master::PromoteAutoFlagsResponsePB resp; + rpc::RpcController rpc; + rpc.set_timeout(kTimeout); + req.set_max_flag_class(ToString(flag_class)); + req.set_promote_non_runtime_flags(false); + req.set_force(false); + RETURN_NOT_OK(cluster_->GetLeaderMasterProxy().PromoteAutoFlags( + req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + WaitForAutoFlagApply(); + + LOG(INFO) << "Promoted AutoFlags: " << resp.DebugString(); + + if (flag_class == AutoFlagClass::kLocalVolatile) { + // Store the version info in case we want to rollback. + SCHECK(!auto_flags_rollback_version_, IllegalState, "Already promoted local volatile"); + if (resp.flags_promoted()) { + auto_flags_rollback_version_ = resp.new_config_version() - 1; + } + } else { + // Can no longer rollback volatile flags. 
+ auto_flags_rollback_version_.reset(); + } + + return Status::OK(); +} + +Status UpgradeTestBase::PerformYsqlUpgrade() { + LOG(INFO) << "Running ysql upgrade"; + + tserver::UpgradeYsqlRequestPB req; + tserver::UpgradeYsqlResponsePB resp; + rpc::RpcController rpc; + rpc.set_timeout(2min * kTimeMultiplier); + + RETURN_NOT_OK(cluster_->GetTServerProxy(0).UpgradeYsql( + req, &resp, &rpc)); + + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + + return Status::OK(); +} + +Status UpgradeTestBase::FinalizeUpgrade() { + LOG(INFO) << "Finalizing upgrade"; + + RETURN_NOT_OK_PREPEND(PromoteAutoFlags(), "Failed to promote AutoFlags"); + + RETURN_NOT_OK_PREPEND(PerformYsqlUpgrade(), "Failed to perform ysql upgrade"); + + // Set the current version bin path for the cluster, so that any newly added nodes get started on + // the new version. + cluster_->SetDaemonBinPath(current_version_bin_path_); + + return Status::OK(); +} + +Status UpgradeTestBase::RollbackVolatileAutoFlags() { + if (!auto_flags_rollback_version_) { + return Status::OK(); + } + + LOG(INFO) << "Rolling back AutoFlags to version " << *auto_flags_rollback_version_; + + master::RollbackAutoFlagsRequestPB req; + master::RollbackAutoFlagsResponsePB resp; + rpc::RpcController rpc; + rpc.set_timeout(kTimeout); + req.set_rollback_version(*auto_flags_rollback_version_); + RETURN_NOT_OK(cluster_->GetLeaderMasterProxy().RollbackAutoFlags( + req, &resp, &rpc)); + if (resp.has_error()) { + return StatusFromPB(resp.error().status()); + } + auto_flags_rollback_version_.reset(); + + WaitForAutoFlagApply(); + + LOG(INFO) << "Rolled back AutoFlags: " << resp.DebugString(); + + return Status::OK(); +} + +Status UpgradeTestBase::RollbackClusterToOldVersion(MonoDelta delay_between_nodes) { + LOG(INFO) << "Rolling back upgrade"; + + RETURN_NOT_OK_PREPEND(RollbackVolatileAutoFlags(), "Failed to rollback Volatile AutoFlags"); + + RETURN_NOT_OK_PREPEND( + RestartAllTServersInOldVersion(delay_between_nodes), 
"Failed to restart tservers"); + + RETURN_NOT_OK_PREPEND( + RestartAllMastersInOldVersion(delay_between_nodes), "Failed to restart masters"); + + return Status::OK(); +} + +Status UpgradeTestBase::RestartAllMastersInOldVersion(MonoDelta delay_between_nodes) { + LOG(INFO) << "Restarting all yb-masters in old version"; + + for (auto* master : cluster_->master_daemons()) { + RETURN_NOT_OK(RestartMasterInOldVersion(*master, /*wait_for_cluster_to_stabilize=*/false)); + SleepFor(delay_between_nodes); + } + + RETURN_NOT_OK(WaitForClusterToStabilize()); + + return Status::OK(); +} + +Status UpgradeTestBase::RestartMasterInOldVersion( + ExternalMaster& master, bool wait_for_cluster_to_stabilize) { + LOG(INFO) << "Restarting yb-master " << master.id() << " in old version"; + RETURN_NOT_OK(RestartDaemonInVersion(master, old_version_master_bin_path_)); + + if (wait_for_cluster_to_stabilize) { + RETURN_NOT_OK(WaitForClusterToStabilize()); + } + + return Status::OK(); +} + +Status UpgradeTestBase::RestartAllTServersInOldVersion(MonoDelta delay_between_nodes) { + LOG(INFO) << "Restarting all yb-tservers in old version"; + + for (auto* tserver : cluster_->tserver_daemons()) { + RETURN_NOT_OK(RestartTServerInOldVersion(*tserver, /*wait_for_cluster_to_stabilize=*/false)); + SleepFor(delay_between_nodes); + } + + RETURN_NOT_OK(WaitForClusterToStabilize()); + + return Status::OK(); +} + +Status UpgradeTestBase::RestartTServerInOldVersion( + ExternalTabletServer& ts, bool wait_for_cluster_to_stabilize) { + LOG(INFO) << "Restarting yb-tserver " << ts.id() << " in old version"; + RETURN_NOT_OK(RestartDaemonInVersion(ts, old_version_tserver_bin_path_)); + + if (wait_for_cluster_to_stabilize) { + RETURN_NOT_OK(WaitForClusterToStabilize()); + } + + return Status::OK(); +} + +Status UpgradeTestBase::WaitForClusterToStabilize() { + RETURN_NOT_OK(cluster_->WaitForTabletServerCount(cluster_->num_tablet_servers(), kTimeout)); + + return Status::OK(); +} + +} // namespace yb diff --git 
a/src/yb/integration-tests/upgrade-tests/upgrade_test_base.h b/src/yb/integration-tests/upgrade-tests/upgrade_test_base.h new file mode 100644 index 000000000000..32d08a5a4a47 --- /dev/null +++ b/src/yb/integration-tests/upgrade-tests/upgrade_test_base.h @@ -0,0 +1,110 @@ +// Copyright (c) YugabyteDB, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. See the License for the specific language governing permissions and limitations +// under the License. +// + +#pragma once + +#include "yb/integration-tests/external_mini_cluster-itest-base.h" +#include "yb/util/test_thread_holder.h" +#include "yb/util/version_info.pb.h" + +namespace yb { + +struct BuildInfo { + std::string version; + std::string build_number; + std::string linux_debug_x86_url; + std::string linux_release_x86_url; + std::string darwin_debug_arm64_url; + std::string darwin_release_arm64_url; +}; + +// Helper class to perform upgrades and rollbacks of Yugabyte DB. +// This test sets up an ExternalMiniCluster on an older yb version and helps upgrade it to the +// current version, and roll back to the older version. 
+class UpgradeTestBase : public ExternalMiniClusterITestBase { + public: + explicit UpgradeTestBase(const std::string& from_version); + virtual ~UpgradeTestBase() = default; + + void SetUp() override; + + protected: + Status StartClusterInOldVersion(); + Status StartClusterInOldVersion(const ExternalMiniClusterOptions& options); + + Status UpgradeClusterToCurrentVersion( + MonoDelta delay_between_nodes = 3s, bool auto_finalize = true); + + Status RestartAllMastersInCurrentVersion(MonoDelta delay_between_nodes = 3s); + Status RestartMasterInCurrentVersion( + ExternalMaster& master, bool wait_for_cluster_to_stabilize = true); + Status RestartAllTServersInCurrentVersion(MonoDelta delay_between_nodes = 3s); + Status RestartTServerInCurrentVersion( + ExternalTabletServer& ts, bool wait_for_cluster_to_stabilize = true); + + Status FinalizeUpgrade(); + + Status PromoteAutoFlags(AutoFlagClass flag_class = AutoFlagClass::kExternal); + + Status PerformYsqlUpgrade(); + + Status RollbackClusterToOldVersion(MonoDelta delay_between_nodes = 3s); + + Status RestartAllMastersInOldVersion(MonoDelta delay_between_nodes = 3s); + Status RestartMasterInOldVersion( + ExternalMaster& master, bool wait_for_cluster_to_stabilize = true); + + Status RestartAllTServersInOldVersion(MonoDelta delay_between_nodes = 3s); + Status RestartTServerInOldVersion( + ExternalTabletServer& ts, bool wait_for_cluster_to_stabilize = true); + + Status RollbackVolatileAutoFlags(); + + // Wait for the cluster to stabilize after an upgrade or rollback. + // Waits for all tservers to register with the master leader, which happens after all tablets have + // been bootstrapped. + Status WaitForClusterToStabilize(); + + BuildInfo old_version_info() const { return old_version_info_; } + // Can only be used after SetUp has been called. 
+ VersionInfoPB current_version_info() const { return current_version_info_; } + TestThreadHolder test_thread_holder_; + + private: + const BuildInfo old_version_info_; + VersionInfoPB current_version_info_; + + std::string old_version_bin_path_, current_version_bin_path_; + std::string old_version_master_bin_path_, current_version_master_bin_path_; + std::string old_version_tserver_bin_path_, current_version_tserver_bin_path_; + + std::optional auto_flags_rollback_version_; +}; + +// Supported builds +static constexpr auto kBuild_2_20_2_4 = "2.20.2.4"; +static constexpr auto kBuild_2024_1_0_1 = "2024.1.0.1"; + +// Helper classes for specific versions +class TestUpgradeFrom_2_20_2_4 : public UpgradeTestBase { + public: + TestUpgradeFrom_2_20_2_4() : UpgradeTestBase(kBuild_2_20_2_4) {} + virtual ~TestUpgradeFrom_2_20_2_4() = default; +}; + +class TestUpgradeFrom_2024_1_0_1 : public UpgradeTestBase { + public: + TestUpgradeFrom_2024_1_0_1() : UpgradeTestBase(kBuild_2024_1_0_1) {} +}; + +} // namespace yb diff --git a/src/yb/util/status_format.h b/src/yb/util/status_format.h index 1d5aa18cedef..81049347c37a 100644 --- a/src/yb/util/status_format.h +++ b/src/yb/util/status_format.h @@ -84,8 +84,12 @@ #define SCHECK_STR_CONTAINS(str, substr) \ SCHECK_NE( \ - str.find(substr), std::string::npos, NotFound, \ - Format("'$0' does not contain '$1'", str, substr)) + str.find((substr)), std::string::npos, NotFound, \ + Format("'$0' does not contain '$1'", str, (substr))) + +#define SCHECK_STR_NOT_CONTAINS(str, substr) \ + SCHECK_EQ( \ + str.find(substr), std::string::npos, IllegalState, Format("'$0' contains '$1'", str, substr)) #ifndef NDEBUG