diff --git a/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go b/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go
index 13ae60113bf5..9ec0c056483e 100644
--- a/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go
+++ b/pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go
@@ -833,6 +833,7 @@ func TestFollowerReadsWithStaleDescriptor(t *testing.T) {
 // where it needs to be estimated using node localities.
 func TestSecondaryTenantFollowerReadsRouting(t *testing.T) {
 	defer leaktest.AfterTest(t)()
+	skip.WithIssue(t, 95338, "flaky test")
 	defer utilccl.TestingEnableEnterprise()()
 	skip.UnderStressRace(t, "times out")
 
diff --git a/pkg/cli/debug_recover_loss_of_quorum_test.go b/pkg/cli/debug_recover_loss_of_quorum_test.go
index 7fff194fda21..cae1711cf567 100644
--- a/pkg/cli/debug_recover_loss_of_quorum_test.go
+++ b/pkg/cli/debug_recover_loss_of_quorum_test.go
@@ -21,9 +21,11 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/keys"
 	"github.com/cockroachdb/cockroach/pkg/kv"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/rpc"
+	"github.com/cockroachdb/cockroach/pkg/server"
 	"github.com/cockroachdb/cockroach/pkg/server/serverpb"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
@@ -329,6 +331,192 @@ func createIntentOnRangeDescriptor(
 	}
 }
 
+func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	skip.UnderDeadlock(t, "slow under deadlock")
+
+	ctx := context.Background()
+	dir, cleanupFn := testutils.TempDir(t)
+	defer cleanupFn()
+
+	c := NewCLITest(TestCLIParams{
+		NoServer: true,
+	})
+	defer c.Cleanup()
+
+	listenerReg := testutils.NewListenerRegistry()
+	defer listenerReg.Close()
+
+	storeReg := server.NewStickyInMemEnginesRegistry()
+	defer storeReg.CloseAllStickyInMemEngines()
+
+	// The test cluster contains 3 nodes that we turn into a single-node
+	// cluster using loss of quorum recovery. To do that, we terminate two
+	// nodes and run recovery on the remaining one. Restarting that node
+	// should bring it back to a healthy (but underreplicated) state.
+	// TODO(oleg): Make the test run with 7 nodes to exercise cases where
+	// multiple replicas survive. Current startup and allocator behaviour
+	// would make this test flaky.
+	tc := testcluster.NewTestCluster(t, 3, base.TestClusterArgs{
+		ReusableListeners: true,
+		ServerArgs: base.TestServerArgs{
+			StoreSpecs: []base.StoreSpec{
+				{InMemory: true},
+			},
+		},
+		ServerArgsPerNode: map[int]base.TestServerArgs{
+			0: {
+				Knobs: base.TestingKnobs{
+					Server: &server.TestingKnobs{
+						StickyEngineRegistry: storeReg,
+					},
+				},
+				Listener: listenerReg.GetOrFail(t, 0),
+				StoreSpecs: []base.StoreSpec{
+					{InMemory: true, StickyInMemoryEngineID: "1"},
+				},
+			},
+		},
+	})
+	tc.Start(t)
+	s := sqlutils.MakeSQLRunner(tc.Conns[0])
+	s.Exec(t, "set cluster setting cluster.organization='remove dead replicas test'")
+	defer tc.Stopper().Stop(ctx)
+
+	// We use a scratch range to test the special case of a pending update on
+	// the range descriptor that has to be cleaned up before recovery can
+	// proceed. For that, we'll ensure the range is not empty and then put an
+	// intent on its descriptor. After recovery, we'll check that the range
+	// is still accessible for writes as normal.
+
+	sk := tc.ScratchRange(t)
+	require.NoError(t,
+		tc.Server(0).DB().Put(ctx, testutils.MakeKey(sk, []byte{1}), "value"),
+		"failed to write value to scratch range")
+
+	createIntentOnRangeDescriptor(ctx, t, tc, sk)
+
+	node1ID := tc.Servers[0].NodeID()
+
+	// Now that stores are prepared and replicated, we can shut down the
+	// cluster and perform store manipulations.
+	tc.StopServer(1)
+	tc.StopServer(2)
+
+	// Generate a recovery plan and verify that the plan file was created and
+	// contains meaningful data. This is not strictly necessary for verifying
+	// the end-to-end flow, but having assertions on the generated data helps
+	// identify which stage of the pipeline broke if the test fails.
+	planFile := dir + "/recovery-plan.json"
+	out, err := c.RunWithCaptureArgs(
+		[]string{
+			"debug",
+			"recover",
+			"make-plan",
+			"--confirm=y",
+			"--certs-dir=test_certs",
+			"--host=" + tc.Server(0).ServingRPCAddr(),
+			"--plan=" + planFile,
+		})
+	require.NoError(t, err, "failed to run make-plan")
+	require.Contains(t, out, fmt.Sprintf("- node n%d", node1ID),
+		"planner didn't provide correct apply instructions")
+	require.FileExists(t, planFile, "generated plan file")
+	planFileContent, err := os.ReadFile(planFile)
+	require.NoError(t, err, "test infra failed, can't open created plan file")
+	plan := loqrecoverypb.ReplicaUpdatePlan{}
+	jsonpb := protoutil.JSONPb{}
+	require.NoError(t, jsonpb.Unmarshal(planFileContent, &plan),
+		"failed to deserialize replica recovery plan")
+	require.NotEmpty(t, plan.Updates, "resulting plan contains no updates")
+
+	out, err = c.RunWithCaptureArgs(
+		[]string{
+			"debug", "recover", "apply-plan",
+			"--certs-dir=test_certs",
+			"--host=" + tc.Server(0).ServingRPCAddr(),
+			"--confirm=y", planFile,
+		})
+	require.NoError(t, err, "failed to run apply plan")
+	// Check that there was at least one mention of a replica being promoted.
+	require.Contains(t, out, "updating replica", "no replica updates were recorded")
+	require.Contains(t, out, fmt.Sprintf("Plan staged. To complete recovery restart nodes n%d.", node1ID),
+		"apply plan failed to stage on expected nodes")
+
+	// Verify that the plan is staged on the nodes.
+	out, err = c.RunWithCaptureArgs(
+		[]string{
+			"debug", "recover", "verify",
+			"--certs-dir=test_certs",
+			"--host=" + tc.Server(0).ServingRPCAddr(),
+			planFile,
+		})
+	require.NoError(t, err, "failed to run verify plan")
+	require.Contains(t, out, "ERROR: loss of quorum recovery is not finished yet")
+
+	tc.StopServer(0)
+
+	// NB: If recovery is not performed, the server will just hang on startup.
+	// This is caused by the liveness range becoming unavailable, which
+	// prevents any progress. So it is likely that the test will time out if
+	// the basic workflow fails.
+	listenerReg.ReopenOrFail(t, 0)
+	require.NoError(t, tc.RestartServer(0), "restart failed")
+	s = sqlutils.MakeSQLRunner(tc.Conns[0])
+
+	// Verify that the post-start cleanup decommissioned the dead nodes,
+	// which prevents them from rejoining the cluster.
+	ac, err := tc.GetAdminClient(ctx, t, 0)
+	require.NoError(t, err, "failed to get admin client")
+	testutils.SucceedsSoon(t, func() error {
+		dr, err := ac.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: []roachpb.NodeID{2, 3}})
+		if err != nil {
+			return err
+		}
+		for _, s := range dr.Status {
+			if s.Membership != livenesspb.MembershipStatus_DECOMMISSIONED {
+				return errors.Newf("expecting n%d to be decommissioned", s.NodeID)
+			}
+		}
+		return nil
+	})
+
+	// Validate that the rangelog is updated with recovery records after the
+	// cluster restarts.
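+	// Each replica update applied by the recovery flow is expected to leave
+	// one 'unsafe_quorum_recovery' event in system.rangelog, so the event
+	// count should match the number of updates in the plan.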
+	testutils.SucceedsSoon(t, func() error {
+		r := s.QueryRow(t,
+			`select count(*) from system.rangelog where "eventType" = 'unsafe_quorum_recovery'`)
+		var recoveries int
+		r.Scan(&recoveries)
+		if recoveries != len(plan.Updates) {
+			return errors.Errorf("found %d recovery events while expecting %d", recoveries,
+				len(plan.Updates))
+		}
+		return nil
+	})
+
+	// Verify that recovery is complete.
+	out, err = c.RunWithCaptureArgs(
+		[]string{
+			"debug", "recover", "verify",
+			"--certs-dir=test_certs",
+			"--host=" + tc.Server(0).ServingRPCAddr(),
+			planFile,
+		})
+	require.NoError(t, err, "failed to run verify plan")
+	require.Contains(t, out, "Loss of quorum recovery is complete.")
+
+	// We used the scratch range to test cleanup of a pending transaction on
+	// the range descriptor key. Verify that after recovery the range is
+	// still writable, i.e. that recovery succeeded.
+	require.NoError(t,
+		tc.Server(0).DB().Put(ctx, testutils.MakeKey(sk, []byte{1}), "value2"),
+		"failed to write value to scratch range after recovery")
+
+	// Finally, split the scratch range to ensure metadata ranges are
+	// recovered.
+	_, _, err = tc.Server(0).SplitRange(testutils.MakeKey(sk, []byte{42}))
+	require.NoError(t, err, "failed to split range after recovery")
+}
+
 // TestJsonSerialization verifies that all fields serialized in JSON could be
 // read back. This specific test addresses issues where default naming scheme
 // may not work in combination with other tags correctly. e.g. repeated used
diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go
index 2912588e41e6..07cbb937ef85 100644
--- a/pkg/cmd/roachtest/cluster.go
+++ b/pkg/cmd/roachtest/cluster.go
@@ -1306,10 +1306,13 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error {
 		//
 		// Ignore the files in the log directory; we pull the logs separately anyway
 		// so this would only cause duplication.
-		si := strconv.Itoa(i)
-		cmd := []string{"./cockroach", "debug", "zip", "--exclude-files='*.log,*.txt,*.pprof'", "--url", "{pgurl:" + si + "}", zipName}
-		if err := c.RunE(ctx, c.All(), cmd...); err != nil {
-			l.Printf("./cockroach debug zip failed: %v", err)
+		excludeFiles := "*.log,*.txt,*.pprof"
+		cmd := fmt.Sprintf(
+			"./cockroach debug zip --exclude-files='%s' --url {pgurl:%d} %s",
+			excludeFiles, i, zipName,
+		)
+		if err := c.RunE(ctx, c.Node(i), cmd); err != nil {
+			l.Printf("./cockroach debug zip failed on node %d: %v", i, err)
 			if i < c.spec.NodeCount {
 				continue
 			}
@@ -1403,7 +1406,7 @@ func (c *clusterImpl) FetchDmesg(ctx context.Context, l *logger.Logger) error {
 	}
 
 	// Run dmesg on all nodes to redirect the kernel ring buffer content to a file.
-	cmd := []string{"/bin/bash", "-c", "'sudo dmesg > " + name + "'"}
+	cmd := []string{"/bin/bash", "-c", "'sudo dmesg -T > " + name + "'"}
 	var results []install.RunResultDetails
 	var combinedDmesgError error
 	if results, combinedDmesgError = c.RunWithDetails(ctx, nil, c.All(), cmd...); combinedDmesgError != nil {
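
The new test leans on the plan-file round trip: "debug recover make-plan"
serializes a loqrecoverypb.ReplicaUpdatePlan to JSON, and the test reads it
back through protoutil.JSONPb. Below is a minimal, self-contained sketch of
that round trip; the readPlan helper, the main function, and the
"recovery-plan.json" path are illustrative only and are not part of this
change.

package main

import (
	"fmt"
	"os"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
)

// readPlan loads a replica update plan written by "debug recover make-plan".
// protoutil.JSONPb is used instead of encoding/json so that proto field
// naming is handled the same way the CLI handles it.
func readPlan(path string) (loqrecoverypb.ReplicaUpdatePlan, error) {
	var plan loqrecoverypb.ReplicaUpdatePlan
	data, err := os.ReadFile(path)
	if err != nil {
		return plan, err
	}
	jsonpb := protoutil.JSONPb{}
	if err := jsonpb.Unmarshal(data, &plan); err != nil {
		return plan, err
	}
	return plan, nil
}

func main() {
	plan, err := readPlan("recovery-plan.json")
	if err != nil {
		fmt.Println("failed to read plan:", err)
		return
	}
	// The plan carries one update per surviving replica that recovery will
	// rewrite, which is also what the rangelog assertion in the test counts.
	fmt.Printf("plan contains %d replica update(s)\n", len(plan.Updates))
}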