96261: cli: loss of quorum recovery int-test for half online mode r=erikgrinaker a=aliher1911

This commit adds an integration test for loss of quorum recovery in
half-online mode. Its functionality largely mirrors the behaviour of the
offline quorum recovery test.

Release note: None

Fixes #93052

97118: roachtest: fix fetching of debug zip and add dmesg flag r=herkolategan a=renatolabs

**roachtest: fix fetching of debug zip**
The roachtest logic to fetch the debug zip after a failure iterates
over each node in the cluster spec for that test and attempts to run
the `debug zip` command, returning once it succeeds on any node. The
idea is that at that stage in the test, there's no way to know which
node is alive, so every node is attempted.

However, the `Run` command that was actually executed used `c.All()`,
meaning that, in practice, the fetch would fail if _any_ node failed to
run the `debug zip` command. One common scenario where this bug surfaces
is in tests that don't upload the `cockroach` binary to every node, for
example because one of the nodes exists only to run a `./workload`
command. In those cases, we would fail to fetch the `debug.zip` just
because the workload node didn't have the cockroach binary (see #97100,
for example).

This commit fixes the bug/typo by running the `./cockroach debug zip`
command on each node individually.
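
As a rough sketch of the per-node retry pattern described above (this is a
hypothetical helper, not the actual roachtest code; `runOnNode` and
`nodeCount` are assumed stand-ins for the cluster API):

import (
	"context"
	"fmt"
)

// fetchDebugZipFromAnyNode tries `debug zip` on each node in turn and stops
// at the first success, tolerating nodes that cannot run the command.
func fetchDebugZipFromAnyNode(
	ctx context.Context,
	nodeCount int,
	runOnNode func(ctx context.Context, node int, cmd string) error,
) error {
	var lastErr error
	for i := 1; i <= nodeCount; i++ {
		cmd := fmt.Sprintf(
			"./cockroach debug zip --exclude-files='*.log,*.txt,*.pprof' --url {pgurl:%d} debug.zip",
			i,
		)
		if err := runOnNode(ctx, i, cmd); err != nil {
			// This node may not have the cockroach binary (e.g. a
			// workload-only node); remember the error and try the next one.
			lastErr = err
			continue
		}
		// Succeeded on one live node; that is all we need.
		return nil
	}
	return lastErr
}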

**roachtest: pass -T flag to dmesg**
By default, the timestamps displayed on the `dmesg` output are
relative to the kernel's boot time. This makes it harder to correlate
events in there with test events. This changes artifact collection to
run `dmesg -T` instead, which makes dmesg use human readable
timestamps.
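
For reference, a minimal standalone sketch of the collection step (assumed
helper name, not roachtest code); with `-T`, dmesg prints wall-clock
timestamps such as "[Tue Feb 14 10:03:12 2023]" instead of seconds since
boot such as "[12345.678901]":

import (
	"fmt"
	"os/exec"
)

// captureDmesg redirects the kernel ring buffer to a file with
// human-readable timestamps. Assumes passwordless sudo on the host.
func captureDmesg(path string) error {
	cmd := exec.Command("/bin/bash", "-c", fmt.Sprintf("sudo dmesg -T > %s", path))
	return cmd.Run()
}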

97170: ccl/kvccl/kvfollowerreadsccl: skip TestSecondaryTenantFollowerReadsRouting r=arulajmani a=pavelkalinnikov

Refs: #95338

Reason: flaky test

Generated by bin/skip-test.

Release justification: non-production code changes

Epic: None
Release note: None

Co-authored-by: Oleg Afanasyev <[email protected]>
Co-authored-by: Renato Costa <[email protected]>
Co-authored-by: Pavel Kalinnikov <[email protected]>
4 people committed Feb 15, 2023
4 parents a24b178 + 5640a18 + 763be60 + 30b2b3f commit 6764f27
Showing 3 changed files with 197 additions and 5 deletions.
1 change: 1 addition & 0 deletions pkg/ccl/kvccl/kvfollowerreadsccl/followerreads_test.go
@@ -833,6 +833,7 @@ func TestFollowerReadsWithStaleDescriptor(t *testing.T) {
// where it needs to be estimated using node localities.
func TestSecondaryTenantFollowerReadsRouting(t *testing.T) {
defer leaktest.AfterTest(t)()
skip.WithIssue(t, 95338, "flaky test")
defer utilccl.TestingEnableEnterprise()()

skip.UnderStressRace(t, "times out")
188 changes: 188 additions & 0 deletions pkg/cli/debug_recover_loss_of_quorum_test.go
@@ -21,9 +21,11 @@ import (
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/rpc"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
@@ -329,6 +331,192 @@ func createIntentOnRangeDescriptor(
}
}

func TestHalfOnlineLossOfQuorumRecovery(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

skip.UnderDeadlock(t, "slow under deadlock")

ctx := context.Background()
dir, cleanupFn := testutils.TempDir(t)
defer cleanupFn()

c := NewCLITest(TestCLIParams{
NoServer: true,
})
defer c.Cleanup()

listenerReg := testutils.NewListenerRegistry()
defer listenerReg.Close()

storeReg := server.NewStickyInMemEnginesRegistry()
defer storeReg.CloseAllStickyInMemEngines()

// The test cluster contains 3 nodes that we will turn into a single-node
// cluster using loss of quorum recovery. To do that, we will terminate
// two nodes and run recovery on the remaining one. Restarting the node
// should bring it back to a healthy (but underreplicated) state.
// TODO(oleg): Make test run with 7 nodes to exercise cases where multiple
// replicas survive. Current startup and allocator behaviour would make
// this test flaky.
tc := testcluster.NewTestCluster(t, 3, base.TestClusterArgs{
ReusableListeners: true,
ServerArgs: base.TestServerArgs{
StoreSpecs: []base.StoreSpec{
{InMemory: true},
},
},
ServerArgsPerNode: map[int]base.TestServerArgs{
0: {
Knobs: base.TestingKnobs{
Server: &server.TestingKnobs{
StickyEngineRegistry: storeReg,
},
},
Listener: listenerReg.GetOrFail(t, 0),
StoreSpecs: []base.StoreSpec{
{InMemory: true, StickyInMemoryEngineID: "1"},
},
},
},
})
tc.Start(t)
s := sqlutils.MakeSQLRunner(tc.Conns[0])
s.Exec(t, "set cluster setting cluster.organization='remove dead replicas test'")
defer tc.Stopper().Stop(ctx)

// We use the scratch range to test the special case of a pending update
// on the range descriptor which has to be cleaned up before recovery can
// proceed. For that we'll ensure the range is not empty and then put an
// intent on its descriptor. After recovery, we'll check that the range is
// still writable as normal.
sk := tc.ScratchRange(t)
require.NoError(t,
tc.Server(0).DB().Put(ctx, testutils.MakeKey(sk, []byte{1}), "value"),
"failed to write value to scratch range")

createIntentOnRangeDescriptor(ctx, t, tc, sk)

node1ID := tc.Servers[0].NodeID()

// Now that stores are prepared and replicated we can shut down the
// cluster and perform store manipulations.
tc.StopServer(1)
tc.StopServer(2)

// Generate a recovery plan and verify that the plan file was generated and
// contains meaningful data. This is not strictly necessary for verifying the
// end-to-end flow, but having assertions on the generated data helps to
// identify which stage of the pipeline broke if the test fails.
planFile := dir + "/recovery-plan.json"
out, err := c.RunWithCaptureArgs(
[]string{
"debug",
"recover",
"make-plan",
"--confirm=y",
"--certs-dir=test_certs",
"--host=" + tc.Server(0).ServingRPCAddr(),
"--plan=" + planFile,
})
require.NoError(t, err, "failed to run make-plan")
require.Contains(t, out, fmt.Sprintf("- node n%d", node1ID),
"planner didn't provide correct apply instructions")
require.FileExists(t, planFile, "generated plan file")
planFileContent, err := os.ReadFile(planFile)
require.NoError(t, err, "test infra failed, can't open created plan file")
plan := loqrecoverypb.ReplicaUpdatePlan{}
jsonpb := protoutil.JSONPb{}
require.NoError(t, jsonpb.Unmarshal(planFileContent, &plan),
"failed to deserialize replica recovery plan")
require.NotEmpty(t, plan.Updates, "resulting plan contains no updates")

out, err = c.RunWithCaptureArgs(
[]string{
"debug", "recover", "apply-plan",
"--certs-dir=test_certs",
"--host=" + tc.Server(0).ServingRPCAddr(),
"--confirm=y", planFile,
})
require.NoError(t, err, "failed to run apply plan")
// Check that there was at least one mention of a replica being promoted.
require.Contains(t, out, "updating replica", "no replica updates were recorded")
require.Contains(t, out, fmt.Sprintf("Plan staged. To complete recovery restart nodes n%d.", node1ID),
"apply plan failed to stage on expected nodes")

// Verify the plan is staged on the nodes.
out, err = c.RunWithCaptureArgs(
[]string{
"debug", "recover", "verify",
"--certs-dir=test_certs",
"--host=" + tc.Server(0).ServingRPCAddr(),
planFile,
})
require.NoError(t, err, "failed to run verify plan")
require.Contains(t, out, "ERROR: loss of quorum recovery is not finished yet")

tc.StopServer(0)

// NB: If recovery is not performed, the server will just hang on startup.
// This is caused by the liveness range becoming unavailable and preventing
// any progress, so the test will likely time out if the basic workflow fails.
listenerReg.ReopenOrFail(t, 0)
require.NoError(t, tc.RestartServer(0), "restart failed")
s = sqlutils.MakeSQLRunner(tc.Conns[0])

// Verify that the post-start cleanup performed node decommissioning, which
// prevents the old nodes from rejoining.
ac, err := tc.GetAdminClient(ctx, t, 0)
require.NoError(t, err, "failed to get admin client")
testutils.SucceedsSoon(t, func() error {
dr, err := ac.DecommissionStatus(ctx, &serverpb.DecommissionStatusRequest{NodeIDs: []roachpb.NodeID{2, 3}})
if err != nil {
return err
}
for _, s := range dr.Status {
if s.Membership != livenesspb.MembershipStatus_DECOMMISSIONED {
return errors.Newf("expecting n%d to be decommissioned", s.NodeID)
}
}
return nil
})

// Validate that the rangelog is updated with recovery records after the cluster restarts.
testutils.SucceedsSoon(t, func() error {
r := s.QueryRow(t,
`select count(*) from system.rangelog where "eventType" = 'unsafe_quorum_recovery'`)
var recoveries int
r.Scan(&recoveries)
if recoveries != len(plan.Updates) {
return errors.Errorf("found %d recovery events while expecting %d", recoveries,
len(plan.Updates))
}
return nil
})

// Verify recovery complete.
out, err = c.RunWithCaptureArgs(
[]string{
"debug", "recover", "verify",
"--certs-dir=test_certs",
"--host=" + tc.Server(0).ServingRPCAddr(),
planFile,
})
require.NoError(t, err, "failed to run verify plan")
require.Contains(t, out, "Loss of quorum recovery is complete.")

// We were using the scratch range to test cleanup of a pending transaction
// on the range descriptor key. Verify that after recovery the range is
// still writable, i.e. recovery succeeded.
require.NoError(t,
tc.Server(0).DB().Put(ctx, testutils.MakeKey(sk, []byte{1}), "value2"),
"failed to write value to scratch range after recovery")

// Finally, split the scratch range to ensure metadata ranges are recovered.
_, _, err = tc.Server(0).SplitRange(testutils.MakeKey(sk, []byte{42}))
require.NoError(t, err, "failed to split range after recovery")
}

// TestJsonSerialization verifies that all fields serialized in JSON could be
// read back. This specific test addresses issues where default naming scheme
// may not work in combination with other tags correctly. e.g. repeated used
13 changes: 8 additions & 5 deletions pkg/cmd/roachtest/cluster.go
@@ -1306,10 +1306,13 @@ func (c *clusterImpl) FetchDebugZip(ctx context.Context, l *logger.Logger) error
//
// Ignore the files in the log directory; we pull the logs separately anyway
// so this would only cause duplication.
si := strconv.Itoa(i)
cmd := []string{"./cockroach", "debug", "zip", "--exclude-files='*.log,*.txt,*.pprof'", "--url", "{pgurl:" + si + "}", zipName}
if err := c.RunE(ctx, c.All(), cmd...); err != nil {
l.Printf("./cockroach debug zip failed: %v", err)
excludeFiles := "*.log,*.txt,*.pprof"
cmd := fmt.Sprintf(
"./cockroach debug zip --exclude-files='%s' --url {pgurl:%d} %s",
excludeFiles, i, zipName,
)
if err := c.RunE(ctx, c.Node(i), cmd); err != nil {
l.Printf("./cockroach debug zip failed on node %d: %v", i, err)
if i < c.spec.NodeCount {
continue
}
@@ -1403,7 +1406,7 @@ func (c *clusterImpl) FetchDmesg(ctx context.Context, l *logger.Logger) error {
}

// Run dmesg on all nodes to redirect the kernel ring buffer content to a file.
cmd := []string{"/bin/bash", "-c", "'sudo dmesg > " + name + "'"}
cmd := []string{"/bin/bash", "-c", "'sudo dmesg -T > " + name + "'"}
var results []install.RunResultDetails
var combinedDmesgError error
if results, combinedDmesgError = c.RunWithDetails(ctx, nil, c.All(), cmd...); combinedDmesgError != nil {
