From e24d8b076b6b6f8662514d083de7bc82e05cc092 Mon Sep 17 00:00:00 2001 From: Minghui Yang Date: Fri, 26 Jul 2024 01:03:54 +0000 Subject: [PATCH] [BACKPORT 2024.1][#13358] YSQL: Fix bug in partitioned table DDL atomicity stress test Summary: This diff adds a new type of DDL atomicity stress test for partitioned table into the existing DDL atomicity stress test suite. These new tests exposed a bug that caused them to be flaky (manifested as test timeout after 10 minutes). The bug happens when the DDL transaction is rolled back. In the DDL transaction verifier state, we have `verifier_state->tables` to represent all the tables involved in this transaction. Once we know that the transaction is aborted, we call `YsqlDdlTxnCompleteCallbackInternal` to process all the tables in verifier_state->tables. Then we set the verifier state to YsqlDdlVerificationState::kDdlPostProcessing to indicate that the transaction is finished. But it is possible that verifier_state->tables does not already have all the tables in the txn. For example, a txn involves three tables t1, t2, t3. After t1 and t2 are added to txn, the txn is aborted due to a conflict. In this case `YsqlDdlTxnCompleteCallbackInternal` is only called on t1 and t2 and the state is set to kDdlPostProcessing before t3 gets added. Later when t3 gets added. We currently returns Status::OK() if the state == kDdlPostProcessing, and t3 will never be processed, leading to test failure due to timeout. The fix is always process all the tables in verifier_state->tables. In the above example we re-process t1 and t2 (they will turn into no-op), and process t3. Jira: DB-2996 Original commit: baf343ff15ae9840625afc5e051d37ee3ec4875b / D36862 Test Plan: ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityPartitionedTablesStressTest.BasicTest -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityPartitionedTablesStressTest.TestTxnVerificationFailure -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityPartitionedTablesStressTest.TestFailCatalogWrites -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityPartitionedTablesStressTest.TestFailDdlRollback -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityPartitionedTablesStressTest.TestFailDdlVerification -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest.BasicTest -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest.TestTxnVerificationFailure -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest.TestFailCatalogWrites -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest.TestFailDdlRollback -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityStressTest.TestFailDdlVerification -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityColocatedStressTest.BasicTest -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityColocatedStressTest.TestTxnVerificationFailure -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityColocatedStressTest.TestFailCatalogWrites -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityColocatedStressTest.TestFailDdlRollback -n 20 --tp 1 ./yb_build.sh debug --sj --cxx-test pg_ddl_atomicity_stress-test --gtest_filter=PgDdlAtomicityColocatedStressTest.TestFailDdlVerification -n 20 --tp 1 Reviewers: hsunder, fizaa Reviewed By: fizaa Subscribers: ybase, yql Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D36933 --- src/yb/master/ysql_ddl_handler.cc | 28 +++++++++--- .../pgwrapper/pg_ddl_atomicity_stress-test.cc | 45 +++++++++++++++++++ 2 files changed, 66 insertions(+), 7 deletions(-) diff --git a/src/yb/master/ysql_ddl_handler.cc b/src/yb/master/ysql_ddl_handler.cc index bdcb4519d9d2..4120485e9e9a 100644 --- a/src/yb/master/ysql_ddl_handler.cc +++ b/src/yb/master/ysql_ddl_handler.cc @@ -198,16 +198,30 @@ Status CatalogManager::YsqlDdlTxnCompleteCallback(const string& pb_txn_id, } auto state = verifier_state->state; + auto txn_state = is_committed ? TxnState::kCommitted : TxnState::kAborted; if (state == YsqlDdlVerificationState::kDdlPostProcessing) { - // Verification is already in progress. - VLOG(3) << "Transaction " << txn << " is already being verified, ignoring"; - return Status::OK(); + // We used to return Status::OK() here on the grounds that the txn is + // already being verified and we assumed verifier_state->tables represent + // all of the tables involved in txn and they are taken care of by + // calling YsqlDdlTxnCompleteCallbackInternal on each of them below. + // However, verifier_state->tables may not include all of the tables + // involved in txn. It is possible that a table is only added into the + // txn after the txn is already in kDdlPostProcessing state. For example, + // a txn involves three tables t1, t2, t3. After t1 and t2 are added to txn, + // the txn is aborted due to some reason (e.g., conflict). In this case + // YsqlDdlTxnCompleteCallbackInternal is only called on t1 and t2 and + // the state is set to kDdlPostProcessing before t3 gets added. Later when + // t3 gets added, if we return Status::OK() here, then t3 will never be + // processed. Therefore we need to call YsqlDdlTxnCompleteCallbackInternal + // to process t3. It is fine to reprocess t1 and t2, that will result in a + // no-op. + SCHECK_EQ(txn_state, verifier_state->txn_state, IllegalState, + Format("Mismatch in txn_state for transaction $0", txn)); + } else { + verifier_state->txn_state = txn_state; + verifier_state->state = YsqlDdlVerificationState::kDdlPostProcessing; } - tables = verifier_state->tables; - verifier_state->txn_state = - (is_committed) ? TxnState::kCommitted : TxnState::kAborted; - verifier_state->state = YsqlDdlVerificationState::kDdlPostProcessing; } bool ddl_verification_success = true; diff --git a/src/yb/yql/pgwrapper/pg_ddl_atomicity_stress-test.cc b/src/yb/yql/pgwrapper/pg_ddl_atomicity_stress-test.cc index 1e457b0421c4..edabeda9a5fc 100644 --- a/src/yb/yql/pgwrapper/pg_ddl_atomicity_stress-test.cc +++ b/src/yb/yql/pgwrapper/pg_ddl_atomicity_stress-test.cc @@ -366,5 +366,50 @@ TEST_F(PgDdlAtomicityColocatedStressTest, TestFailDdlVerification) { ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_verification_failure_probability")); } +/* + * Tests on Partitioned Tables. +*/ +class PgDdlAtomicityPartitionedTablesStressTest : public PgDdlAtomicityStressTest { + virtual int NumIterations() override { + // Fewer iterations are sufficient for partitioned table tests because each DDL statement + // internally invokes (num_partitions + 1) DDLs. + return 3; + } + virtual Status SetupTables() override; +}; + +Status PgDdlAtomicityPartitionedTablesStressTest::SetupTables() { + auto conn = VERIFY_RESULT(Connect()); + RETURN_NOT_OK(conn.ExecuteFormat("CREATE TABLE $0 (key INT PRIMARY KEY, value TEXT, num real) " + " PARTITION BY RANGE(key)", kTable)); + RETURN_NOT_OK(conn.ExecuteFormat( + "CREATE TABLE $0_$1 PARTITION OF $0 FOR VALUES FROM ($1) TO ($2)", + kTable, 1, NumIterations())); + // Create a default partition. + RETURN_NOT_OK(conn.ExecuteFormat("CREATE TABLE $0_default PARTITION OF $0 DEFAULT", kTable)); + return Status::OK(); +} + +TEST_F(PgDdlAtomicityPartitionedTablesStressTest, BasicTest) { + ASSERT_OK(StressTestWithFlag("")); +} + +TEST_F(PgDdlAtomicityPartitionedTablesStressTest, TestTxnVerificationFailure) { + ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_transaction_verification_failure_probability")); +} + +TEST_F(PgDdlAtomicityPartitionedTablesStressTest, TestFailCatalogWrites) { + ASSERT_OK(StressTestWithFlag( + "TEST_ysql_fail_probability_of_catalog_writes_by_ddl_verification")); +} + +TEST_F(PgDdlAtomicityPartitionedTablesStressTest, TestFailDdlRollback) { + ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_rollback_failure_probability")); +} + +TEST_F(PgDdlAtomicityPartitionedTablesStressTest, TestFailDdlVerification) { + ASSERT_OK(StressTestWithFlag("TEST_ysql_ddl_verification_failure_probability")); +} + } // namespace pgwrapper } // namespace yb