From a2a6f21d5c10c45f6c62d38d9a318f677b32dab4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 13 May 2023 14:07:33 +0000 Subject: [PATCH 1/5] roachtest: improve zone config readability in `failover` tests This patch improves readability of zone configs in `failover` tests, by tweaking the APIs. It also fixes a bug which placed replicas on the wrong nodes in `failover/partial/lease-liveness`. Epic: none Release note: None --- pkg/cmd/roachtest/tests/failover.go | 103 ++++++++++++++++------------ 1 file changed, 59 insertions(+), 44 deletions(-) diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index 40f21fdaab03..f0ee4c5ce3cb 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -15,7 +15,6 @@ import ( gosql "database/sql" "fmt" "math/rand" - "strings" "time" "github.com/cockroachdb/cockroach/pkg/base" @@ -111,6 +110,7 @@ func runDisconnect(ctx context.Context, t test.Test, c cluster.Cluster, expLease require.Equal(t, 6, c.Spec().NodeCount) rng, _ := randutil.NewTestRand() + // Create cluster. opts := option.DefaultStartOpts() settings := install.MakeClusterSettings() @@ -125,15 +125,17 @@ func runDisconnect(ctx context.Context, t test.Test, c cluster.Cluster, expLease expLeases) require.NoError(t, err) - constrainAllConfig(t, ctx, conn, 3, []int{4, 5}, 0) - constrainConfig(t, ctx, conn, `RANGE liveness`, 3, []int{3, 5}, 4) + configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) + configureZone(t, ctx, conn, `RANGE liveness`, zoneConfig{ + replicas: 3, onlyNodes: []int{1, 2, 4}, leaseNode: 4}) + // Wait for upreplication. require.NoError(t, WaitFor3XReplication(ctx, t, conn)) t.Status("creating workload database") _, err = conn.ExecContext(ctx, `CREATE DATABASE kv`) require.NoError(t, err) - constrainConfig(t, ctx, conn, `DATABASE kv`, 3, []int{2, 3, 5}, 0) + configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{2, 3, 5}}) c.Run(ctx, c.Node(6), `./cockroach workload init kv --splits 100 {pgurl:1}`) @@ -246,7 +248,7 @@ func runFailoverNonSystem( require.NoError(t, err) // Constrain all existing zone configs to n1-n3. - constrainAllConfig(t, ctx, conn, 3, []int{4, 5, 6}, 0) + configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) // Wait for upreplication. require.NoError(t, WaitFor3XReplication(ctx, t, conn)) @@ -256,7 +258,7 @@ func runFailoverNonSystem( t.Status("creating workload database") _, err = conn.ExecContext(ctx, `CREATE DATABASE kv`) require.NoError(t, err) - constrainConfig(t, ctx, conn, `DATABASE kv`, 3, []int{1, 2, 3}, 0) + configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{4, 5, 6}}) c.Run(ctx, c.Node(7), `./cockroach workload init kv --splits 1000 {pgurl:1}`) // The replicate queue takes forever to move the kv ranges from n1-n3 to @@ -390,10 +392,10 @@ func runFailoverLiveness( require.NoError(t, err) // Constrain all existing zone configs to n1-n3. - constrainAllConfig(t, ctx, conn, 3, []int{4}, 0) + configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) // Constrain the liveness range to n1-n4, with leaseholder preference on n4. - constrainConfig(t, ctx, conn, `RANGE liveness`, 4, nil, 4) + configureZone(t, ctx, conn, `RANGE liveness`, zoneConfig{replicas: 4, leaseNode: 4}) require.NoError(t, err) // Wait for upreplication. 
@@ -404,7 +406,7 @@ func runFailoverLiveness( t.Status("creating workload database") _, err = conn.ExecContext(ctx, `CREATE DATABASE kv`) require.NoError(t, err) - constrainConfig(t, ctx, conn, `DATABASE kv`, 3, []int{4}, 0) + configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) c.Run(ctx, c.Node(5), `./cockroach workload init kv --splits 1000 {pgurl:1}`) // The replicate queue takes forever to move the other ranges off of n4 so we @@ -534,8 +536,8 @@ func runFailoverSystemNonLiveness( // Constrain all existing zone configs to n4-n6, except liveness which is // constrained to n1-n3. - constrainAllConfig(t, ctx, conn, 3, []int{1, 2, 3}, 0) - constrainConfig(t, ctx, conn, `RANGE liveness`, 3, []int{4, 5, 6}, 0) + configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{4, 5, 6}}) + configureZone(t, ctx, conn, `RANGE liveness`, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) require.NoError(t, err) // Wait for upreplication. @@ -546,7 +548,7 @@ func runFailoverSystemNonLiveness( t.Status("creating workload database") _, err = conn.ExecContext(ctx, `CREATE DATABASE kv`) require.NoError(t, err) - constrainConfig(t, ctx, conn, `DATABASE kv`, 3, []int{4, 5, 6}, 0) + configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) c.Run(ctx, c.Node(7), `./cockroach workload init kv --splits 1000 {pgurl:1}`) // The replicate queue takes forever to move the kv ranges from n4-n6 to @@ -945,51 +947,64 @@ func relocateLeases(t test.Test, ctx context.Context, conn *gosql.DB, predicate } } -// constrainConfig will alter the zone config for the target to specify the -// number of nodes the target can be on, the replicas it is prevented from being -// on and an optional leaseholder. -func constrainConfig( - t test.Test, - ctx context.Context, - conn *gosql.DB, - target string, - numNodes int, - constrainedReplicas []int, - lease int, +type zoneConfig struct { + replicas int + onlyNodes []int + leaseNode int +} + +// configureZone sets the zone config for the given target. +func configureZone( + t test.Test, ctx context.Context, conn *gosql.DB, target string, cfg zoneConfig, ) { - replica := make([]string, len(constrainedReplicas)) - for i, n := range constrainedReplicas { - replica[i] = fmt.Sprintf("-node%d", n) + require.NotZero(t, cfg.replicas, "num_replicas must be > 0") + + // If onlyNodes is given, invert the constraint and specify which nodes are + // prohibited. Otherwise, the allocator may leave replicas outside of the + // specified nodes. 
+ var constraintsString string + if len(cfg.onlyNodes) > 0 { + nodeCount := t.Spec().(*registry.TestSpec).Cluster.NodeCount - 1 // subtract worker node + included := map[int]bool{} + for _, nodeID := range cfg.onlyNodes { + included[nodeID] = true + } + excluded := []int{} + for nodeID := 1; nodeID <= nodeCount; nodeID++ { + if !included[nodeID] { + excluded = append(excluded, nodeID) + } + } + for _, nodeID := range excluded { + if len(constraintsString) > 0 { + constraintsString += "," + } + constraintsString += fmt.Sprintf("-node%d", nodeID) + } } - replicaStr := fmt.Sprintf(`'[%s]'`, strings.Join(replica, ",")) - leaseStr := "" - if lease > 0 { - leaseStr = fmt.Sprintf(`[+node%d]`, lease) + var leaseString string + if cfg.leaseNode > 0 { + leaseString += fmt.Sprintf("[+node%d]", cfg.leaseNode) } - str := - fmt.Sprintf( - `ALTER %s CONFIGURE ZONE USING num_replicas = %d, constraints = %s, lease_preferences = '[%s]'`, - target, numNodes, replicaStr, leaseStr) - _, err := conn.ExecContext(ctx, str) - t.Status(str) + query := fmt.Sprintf( + `ALTER %s CONFIGURE ZONE USING num_replicas = %d, constraints = '[%s]', lease_preferences = '[%s]'`, + target, cfg.replicas, constraintsString, leaseString) + t.Status(query) + _, err := conn.ExecContext(ctx, query) require.NoError(t, err) } -// constrainAllConfig will alter the zone config for all zone configurations to -// specify the number of nodes the target can be on, the replicas it is -// prevented from being on and an optional leaseholder. -func constrainAllConfig( - t test.Test, ctx context.Context, conn *gosql.DB, numNodes int, replicas []int, lease int, -) { +// configureAllZones will set zone configuration for all targets in the +// clusters. +func configureAllZones(t test.Test, ctx context.Context, conn *gosql.DB, cfg zoneConfig) { rows, err := conn.QueryContext(ctx, `SELECT target FROM [SHOW ALL ZONE CONFIGURATIONS]`) require.NoError(t, err) - for rows.Next() { var target string require.NoError(t, rows.Scan(&target)) - constrainConfig(t, ctx, conn, target, numNodes, replicas, lease) + configureZone(t, ctx, conn, target, cfg) } require.NoError(t, rows.Err()) } From 558454d3fb6a7272b6f3e1c1e190bce6b5a26472 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 13 May 2023 14:47:31 +0000 Subject: [PATCH 2/5] roachtest: improve `failover/partial/lease-liveness` This test did not sufficiently constrain range/lease placement, which caused occasional permanent unavailability as it randomly hit other failure modes than the one it's trying to test. This patch makes the test more prescriptive, by separating out the system ranges, SQL gateways, liveness leaseholder, and user ranges, and only introducing a partial partition between a user leaseholder and the liveness leaseholder. 
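The topology changes here rely on the `configureZone`/`configureAllZones` helpers from the previous commit. As a rough standalone sketch of what those helpers end up emitting: the `zoneConfig` struct, the constraint inversion, and the resulting `ALTER ... CONFIGURE ZONE` statement mirror the patch, while the `configureZoneSQL` function name, the `main` wrapper, and printing instead of executing are illustrative assumptions only.

// Sketch of how the zone config statement is built. This test uses an 8-node
// cluster where n8 only runs the workload, so 7 CRDB nodes are considered
// when inverting the onlyNodes list into exclusion constraints.
package main

import (
	"fmt"
	"strings"
)

type zoneConfig struct {
	replicas  int
	onlyNodes []int
	leaseNode int
}

func configureZoneSQL(target string, clusterNodes int, cfg zoneConfig) string {
	// Invert onlyNodes into -nodeN exclusion constraints; a positive
	// constraint alone would still let the allocator place replicas on
	// unlisted nodes.
	var constraints []string
	if len(cfg.onlyNodes) > 0 {
		included := map[int]bool{}
		for _, n := range cfg.onlyNodes {
			included[n] = true
		}
		for n := 1; n <= clusterNodes; n++ {
			if !included[n] {
				constraints = append(constraints, fmt.Sprintf("-node%d", n))
			}
		}
	}
	var lease string
	if cfg.leaseNode > 0 {
		lease = fmt.Sprintf("[+node%d]", cfg.leaseNode)
	}
	return fmt.Sprintf(
		`ALTER %s CONFIGURE ZONE USING num_replicas = %d, constraints = '[%s]', lease_preferences = '[%s]'`,
		target, cfg.replicas, strings.Join(constraints, ","), lease)
}

func main() {
	// The liveness zone from this test: 4 replicas on n1-n4, lease on n4.
	fmt.Println(configureZoneSQL(`RANGE liveness`, 7,
		zoneConfig{replicas: 4, onlyNodes: []int{1, 2, 3, 4}, leaseNode: 4}))
}
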
Epic: none Release note: None --- pkg/cmd/roachtest/tests/failover.go | 241 +++++++++++++++++----------- 1 file changed, 148 insertions(+), 93 deletions(-) diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index f0ee4c5ce3cb..50473b5e77b8 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -14,7 +14,6 @@ import ( "context" gosql "database/sql" "fmt" - "math/rand" "time" "github.com/cockroachdb/cockroach/pkg/base" @@ -40,9 +39,9 @@ func registerFailover(r registry.Registry) { Name: "failover/partial/lease-liveness" + suffix, Owner: registry.OwnerKV, Timeout: 30 * time.Minute, - Cluster: r.MakeClusterSpec(6, spec.CPU(4)), + Cluster: r.MakeClusterSpec(8, spec.CPU(4)), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - runDisconnect(ctx, t, c, expirationLeases) + runFailoverPartialLeaseLiveness(ctx, t, c, expirationLeases) }, }) @@ -95,19 +94,28 @@ func registerFailover(r registry.Registry) { } } -func randSleep(ctx context.Context, rng *rand.Rand, max time.Duration) { - randTimer := time.After(randutil.RandDuration(rng, max)) - select { - case <-randTimer: - case <-ctx.Done(): - } -} - -// 5 nodes fully connected. Break the connection between a pair of nodes 4 and 5 -// while running a workload against nodes 1 through 3. Before each disconnect, -// move all the leases to nodes 4 and 5 in a different pattern. -func runDisconnect(ctx context.Context, t test.Test, c cluster.Cluster, expLeases bool) { - require.Equal(t, 6, c.Spec().NodeCount) +// runFailoverPartialLeaseLiveness tests a partial network partition between a +// leaseholder and node liveness. With epoch leases we would normally expect +// this to recover shortly, since the node can't heartbeat its liveness record +// and thus its leases will expire. However, it will maintain Raft leadership, +// and we prevent non-leaders from acquiring leases, which can prevent the lease +// from moving unless we explicitly handle this. See also: +// https://github.com/cockroachdb/cockroach/pull/87244. +// +// Cluster topology: +// +// n1-n3: system ranges and SQL gateways +// n4: liveness leaseholder +// n5-7: user ranges +// +// A partial blackhole network partition is triggered between n4 and each of +// n5-n7 sequentially, 3 times per node for a total of 9 times. A kv50 workload +// is running against SQL gateways on n1-n3, and we collect the pMax latency for +// graphing. +func runFailoverPartialLeaseLiveness( + ctx context.Context, t test.Test, c cluster.Cluster, expLeases bool, +) { + require.Equal(t, 8, c.Spec().NodeCount) rng, _ := randutil.NewTestRand() @@ -115,8 +123,12 @@ func runDisconnect(ctx context.Context, t test.Test, c cluster.Cluster, expLease opts := option.DefaultStartOpts() settings := install.MakeClusterSettings() + failer := makeFailer(t, c, failureModeBlackhole, opts, settings).(partialFailer) + failer.Setup(ctx) + defer failer.Cleanup(ctx) + c.Put(ctx, t.Cockroach(), "./cockroach") - c.Start(ctx, t.L(), opts, settings, c.Range(1, 5)) + c.Start(ctx, t.L(), opts, settings, c.Range(1, 7)) conn := c.Conn(ctx, t.L(), 1) defer conn.Close() @@ -125,36 +137,45 @@ func runDisconnect(ctx context.Context, t test.Test, c cluster.Cluster, expLease expLeases) require.NoError(t, err) + // Place all ranges on n1-n3, and an extra liveness leaseholder replica on n4. 
configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) configureZone(t, ctx, conn, `RANGE liveness`, zoneConfig{ - replicas: 3, onlyNodes: []int{1, 2, 4}, leaseNode: 4}) + replicas: 4, onlyNodes: []int{1, 2, 3, 4}, leaseNode: 4}) // Wait for upreplication. require.NoError(t, WaitFor3XReplication(ctx, t, conn)) + // Create the kv database on n5-n7. t.Status("creating workload database") _, err = conn.ExecContext(ctx, `CREATE DATABASE kv`) require.NoError(t, err) - configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{2, 3, 5}}) + configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{5, 6, 7}}) - c.Run(ctx, c.Node(6), `./cockroach workload init kv --splits 100 {pgurl:1}`) + c.Run(ctx, c.Node(6), `./cockroach workload init kv --splits 1000 {pgurl:1}`) - // Start workload on n6 using nodes 1-3 (not part of partition). We could - // additionally test the behavior of running SQL against nodes 4-5 however - // that complicates the analysis as we want to focus on KV behavior. + // The replicate queue takes forever to move the ranges, so we do it + // ourselves. Precreating the database/range and moving it to the correct + // nodes first is not sufficient, since workload will spread the ranges across + // all nodes regardless. + relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3, 4}, []int{5, 6, 7}) + relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{5, 6, 7}, []int{1, 2, 3, 4}) + relocateRanges(t, ctx, conn, `range_id != 2`, []int{4}, []int{1, 2, 3}) + + // Start workload on n8 using n1-n3 as gateways (not partitioned). t.Status("running workload") - m := c.NewMonitor(ctx, c.Range(1, 3)) + m := c.NewMonitor(ctx, c.Range(1, 7)) m.Go(func(ctx context.Context) error { - c.Run(ctx, c.Node(6), `./cockroach workload run kv --read-percent 50 `+ - `--duration 10m --concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ + c.Run(ctx, c.Node(8), `./cockroach workload run kv --read-percent 50 `+ + `--duration 20m --concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ `--histograms=`+t.PerfArtifactsDir()+`/stats.json `+ `{pgurl:1-3}`) return nil }) - // Make sure we don't leave an outage if this test fails midway. - defer Cleanup(t, c, ctx) - // Start and stop partial between nodes 4 and 5 every 30 seconds. + // Start a worker to fail and recover partial partitions between n4 (liveness) + // and workload leaseholders n5-n7 for 1 minute each, 3 times per node for 9 + // times total. + failer.Ready(ctx, m) m.Go(func(ctx context.Context) error { var raftCfg base.RaftConfig raftCfg.SetDefaults() @@ -162,30 +183,43 @@ func runDisconnect(ctx context.Context, t test.Test, c cluster.Cluster, expLease ticker := time.NewTicker(time.Minute) defer ticker.Stop() - // All the system ranges will be on all the nodes, so they will all move, - // plus many of the non-system ranges. - for i := 0; i < 9; i++ { - t.Status("Moving ranges to nodes 4 and 5 before partition", i) - relocateLeases(t, ctx, conn, `range_id = 2`, 4) - relocateLeases(t, ctx, conn, `voting_replicas @> ARRAY[5] AND range_id != 2`, 5) + for i := 0; i < 3; i++ { + for _, node := range []int{5, 6, 7} { + select { + case <-ticker.C: + case <-ctx.Done(): + return ctx.Err() + } - // Randomly sleep up to the lease renewal interval, to vary the time - // between the last lease renewal and the failure. 
- randSleep(ctx, rng, raftCfg.RangeLeaseRenewalDuration()) + randTimer := time.After(randutil.RandDuration(rng, raftCfg.RangeLeaseRenewalDuration())) - t.Status("disconnecting n4 and n5") - Disconnect(t, c, ctx, []int{4, 5}) + // Ranges and leases may occasionally escape their constraints. Move + // them to where they should be. + relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3, 4}, []int{5, 6, 7}) + relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{node}, []int{1, 2, 3}) + relocateRanges(t, ctx, conn, `range_id = 2`, []int{5, 6, 7}, []int{1, 2, 3, 4}) + relocateLeases(t, ctx, conn, `range_id = 2`, 4) - select { - case <-ticker.C: - case <-ctx.Done(): - return ctx.Err() - } + // Randomly sleep up to the lease renewal interval, to vary the time + // between the last lease renewal and the failure. We start the timer + // before the range relocation above to run them concurrently. + select { + case <-randTimer: + case <-ctx.Done(): + } - qps := measureQPS(ctx, t, conn, 5*time.Second) - t.Status("Node 1 QPS after waiting is: ", qps, " recovering nodes") + t.Status(fmt.Sprintf("failing n%d (blackhole lease/liveness)", node)) + failer.FailPartial(ctx, node, []int{4}) - Cleanup(t, c, ctx) + select { + case <-ticker.C: + case <-ctx.Done(): + return ctx.Err() + } + + t.Status(fmt.Sprintf("recovering n%d (blackhole lease/liveness)", node)) + failer.Recover(ctx, node) + } } return nil }) @@ -696,38 +730,6 @@ func makeFailer( } } -// Disconnect takes a set of nodes and each nodes internal ips. It disconnects -// each node from all the others in the list. -func Disconnect(t test.Test, c cluster.Cluster, ctx context.Context, nodes []int) { - if c.IsLocal() { - t.Status("skipping iptables disconnect on local cluster") - return - } - - ips, err := c.InternalIP(ctx, t.L(), nodes) - require.NoError(t, err) - - // disconnect each node from every other passed in node. - for n := 0; n < len(nodes); n++ { - for ip := 0; ip < len(ips); ip++ { - if n != ip { - c.Run(ctx, c.Node(nodes[n]), `sudo iptables -A INPUT -s `+ips[ip]+` -j DROP`) - c.Run(ctx, c.Node(nodes[n]), `sudo iptables -A OUTPUT -d `+ips[ip]+` -j DROP`) - } - } - } -} - -// Cleanup takes a set of nodes and each nodes internal ips. It disconnects -// each node from all the others in the list. -func Cleanup(t test.Test, c cluster.Cluster, ctx context.Context) { - if c.IsLocal() { - t.Status("skipping iptables cleanup on local cluster") - return - } - c.Run(ctx, c.All(), `sudo iptables -F`) -} - // failer fails and recovers a given node in some particular way. type failer interface { // Setup prepares the failer. It is called before the cluster is started. @@ -747,6 +749,14 @@ type failer interface { Recover(ctx context.Context, nodeID int) } +// partialFailer supports partial failures between specific node pairs. +type partialFailer interface { + failer + + // FailPartial fails the node for the given peers. + FailPartial(ctx context.Context, nodeID int, peerIDs []int) +} + // blackholeFailer causes a network failure where TCP/IP packets to/from port // 26257 are dropped, causing network hangs and timeouts. 
// @@ -765,7 +775,7 @@ func (f *blackholeFailer) Ready(_ context.Context, _ cluster.Monitor) {} func (f *blackholeFailer) Cleanup(ctx context.Context) { if f.c.IsLocal() { - f.t.Status("skipping iptables cleanup on local cluster") + f.t.Status("skipping blackhole cleanup on local cluster") return } f.c.Run(ctx, f.c.All(), `sudo iptables -F`) @@ -773,21 +783,25 @@ func (f *blackholeFailer) Cleanup(ctx context.Context) { func (f *blackholeFailer) Fail(ctx context.Context, nodeID int) { if f.c.IsLocal() { - f.t.Status("skipping fail on local cluster") + f.t.Status("skipping blackhole failure on local cluster") return } - // When dropping both input and output, we use multiport to block traffic both - // to port 26257 and from port 26257 on either side of the connection, to - // avoid any spurious packets from making it through. + // When dropping both input and output, make sure we drop packets in both + // directions for both the inbound and outbound TCP connections, such that we + // get a proper black hole. Only dropping one direction for both of INPUT and + // OUTPUT will still let e.g. TCP retransmits through, which may affect the + // TCP stack behavior and is not representative of real network outages. // - // We don't do this when only blocking in one direction, because e.g. in the - // input case we don't want inbound connections to work (INPUT to 26257), but - // we do want responses for outbound connections to work (INPUT from 26257). + // For the asymmetric partitions, only drop packets in one direction since + // this is representative of accidental firewall rules we've seen cause such + // outages in the wild. if f.input && f.output { - f.c.Run(ctx, f.c.Node(nodeID), - `sudo iptables -A INPUT -m multiport -p tcp --ports 26257 -j DROP`) - f.c.Run(ctx, f.c.Node(nodeID), - `sudo iptables -A OUTPUT -m multiport -p tcp --ports 26257 -j DROP`) + // Inbound TCP connections, both received and sent packets. + f.c.Run(ctx, f.c.Node(nodeID), `sudo iptables -A INPUT -p tcp --dport 26257 -j DROP`) + f.c.Run(ctx, f.c.Node(nodeID), `sudo iptables -A OUTPUT -p tcp --sport 26257 -j DROP`) + // Outbound TCP connections, both sent and received packets. + f.c.Run(ctx, f.c.Node(nodeID), `sudo iptables -A OUTPUT -p tcp --dport 26257 -j DROP`) + f.c.Run(ctx, f.c.Node(nodeID), `sudo iptables -A INPUT -p tcp --sport 26257 -j DROP`) } else if f.input { f.c.Run(ctx, f.c.Node(nodeID), `sudo iptables -A INPUT -p tcp --dport 26257 -j DROP`) } else if f.output { @@ -795,9 +809,50 @@ func (f *blackholeFailer) Fail(ctx context.Context, nodeID int) { } } +// FailPartial creates a partial blackhole failure between the given node and +// peers. +func (f *blackholeFailer) FailPartial(ctx context.Context, nodeID int, peerIDs []int) { + if f.c.IsLocal() { + f.t.Status("skipping blackhole failure on local cluster") + return + } + peerIPs, err := f.c.InternalIP(ctx, f.t.L(), peerIDs) + require.NoError(f.t, err) + + for _, peerIP := range peerIPs { + // When dropping both input and output, make sure we drop packets in both + // directions for both the inbound and outbound TCP connections, such that + // we get a proper black hole. Only dropping one direction for both of INPUT + // and OUTPUT will still let e.g. TCP retransmits through, which may affect + // TCP stack behavior and is not representative of real network outages. + // + // For the asymmetric partitions, only drop packets in one direction since + // this is representative of accidental firewall rules we've seen cause such + // outages in the wild. 
+ if f.input && f.output { + // Inbound TCP connections, both received and sent packets. + f.c.Run(ctx, f.c.Node(nodeID), fmt.Sprintf( + `sudo iptables -A INPUT -p tcp -s %s --dport 26257 -j DROP`, peerIP)) + f.c.Run(ctx, f.c.Node(nodeID), fmt.Sprintf( + `sudo iptables -A OUTPUT -p tcp -d %s --sport 26257 -j DROP`, peerIP)) + // Outbound TCP connections, both sent and received packets. + f.c.Run(ctx, f.c.Node(nodeID), fmt.Sprintf( + `sudo iptables -A OUTPUT -p tcp -d %s --dport 26257 -j DROP`, peerIP)) + f.c.Run(ctx, f.c.Node(nodeID), fmt.Sprintf( + `sudo iptables -A INPUT -p tcp -s %s --sport 26257 -j DROP`, peerIP)) + } else if f.input { + f.c.Run(ctx, f.c.Node(nodeID), fmt.Sprintf( + `sudo iptables -A INPUT -p tcp -s %s --dport 26257 -j DROP`, peerIP)) + } else if f.output { + f.c.Run(ctx, f.c.Node(nodeID), fmt.Sprintf( + `sudo iptables -A OUTPUT -p tcp -d %s --dport 26257 -j DROP`, peerIP)) + } + } +} + func (f *blackholeFailer) Recover(ctx context.Context, nodeID int) { if f.c.IsLocal() { - f.t.Status("skipping iptables recover on local cluster") + f.t.Status("skipping blackhole recovery on local cluster") return } f.c.Run(ctx, f.c.Node(nodeID), `sudo iptables -F`) From 98c45d554faafc3d1029b0f13880945ba3857846 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 13 May 2023 16:38:18 +0000 Subject: [PATCH 3/5] roachtest: add `failover/partial/lease-gateway` This patch adds a roachtest that benchmarks the pMax unavailability during a partial network partition between a SQL gateway and a leaseholder. We currently don't handle this failure mode at all, and expect this to result in permanent unavailability. Epic: none Release note: None --- pkg/cmd/roachtest/tests/failover.go | 199 ++++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index 50473b5e77b8..27d518c7e565 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -35,6 +35,16 @@ func registerFailover(r registry.Registry) { suffix = "/lease=expiration" } + r.Add(registry.TestSpec{ + Name: "failover/partial/lease-gateway" + suffix, + Owner: registry.OwnerKV, + Timeout: 30 * time.Minute, + Cluster: r.MakeClusterSpec(8, spec.CPU(4)), + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + runFailoverPartialLeaseGateway(ctx, t, c, expirationLeases) + }, + }) + r.Add(registry.TestSpec{ Name: "failover/partial/lease-liveness" + suffix, Owner: registry.OwnerKV, @@ -94,6 +104,169 @@ func registerFailover(r registry.Registry) { } } +// runFailoverPartialLeaseGateway tests a partial network partition between a +// SQL gateway and a user range leaseholder. These must be routed via other +// nodes to be able to serve the request. +// +// Cluster topology: +// +// n1-n3: system ranges and user ranges (2/5 replicas) +// n4-n5: user range leaseholders (2/5 replicas) +// n6-n7: SQL gateways and 1 user replica (1/5 replicas) +// +// 5 user range replicas will be placed on n2-n6, with leases on n4. A partial +// partition will be introduced between n4,n5 and n6,n7, both fully and +// individually. This corresponds to the case where we have three data centers +// with a broken network link between one pair. 
For example: +// +// n1-n3 (2 replicas, liveness) +// A +// / \ +// / \ +// n4-n5 B --x-- C n6-n7 <--- n8 (workload) +// (2 replicas, leases) (1 replica, SQL gateways) +// +// Once follower routing is implemented, this tests the following scenarios: +// +// - Routes via followers in both A, B, and C when possible. +// - Skips follower replica on local node that can't reach leaseholder (n6). +// - Skips follower replica in C that can't reach leaseholder (n7 via n6). +// - Skips follower replica in B that's unreachable (n5). +// +// We run a kv50 workload on SQL gateways and collect pMax latency for graphing. +func runFailoverPartialLeaseGateway( + ctx context.Context, t test.Test, c cluster.Cluster, expLeases bool, +) { + require.Equal(t, 8, c.Spec().NodeCount) + + rng, _ := randutil.NewTestRand() + + // Create cluster. + opts := option.DefaultStartOpts() + settings := install.MakeClusterSettings() + + failer := makeFailer(t, c, failureModeBlackhole, opts, settings).(partialFailer) + failer.Setup(ctx) + defer failer.Cleanup(ctx) + + c.Put(ctx, t.Cockroach(), "./cockroach") + c.Start(ctx, t.L(), opts, settings, c.Range(1, 7)) + + conn := c.Conn(ctx, t.L(), 1) + defer conn.Close() + + _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.expiration_leases_only.enabled = $1`, + expLeases) + require.NoError(t, err) + + // Place all ranges on n1-n3 to start with. + configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) + + // Wait for upreplication. + require.NoError(t, WaitFor3XReplication(ctx, t, conn)) + + // Create the kv database with 5 replicas on n2-n6, and leases on n4. + t.Status("creating workload database") + _, err = conn.ExecContext(ctx, `CREATE DATABASE kv`) + require.NoError(t, err) + configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{ + replicas: 5, onlyNodes: []int{2, 3, 4, 5, 6}, leaseNode: 4}) + + c.Run(ctx, c.Node(6), `./cockroach workload init kv --splits 1000 {pgurl:1}`) + + // Wait for the KV table to upreplicate. + waitForUpreplication(t, ctx, conn, `database_name = 'kv'`, 5) + + // The replicate queue takes forever to move the ranges, so we do it + // ourselves. Precreating the database/range and moving it to the correct + // nodes first is not sufficient, since workload will spread the ranges across + // all nodes regardless. + relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 7}, []int{2, 3, 4, 5, 6}) + relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{4, 5, 6, 7}, []int{1, 2, 3}) + relocateLeases(t, ctx, conn, `database_name = 'kv'`, 4) + + // Start workload on n8 using n6-n7 as gateways. + t.Status("running workload") + m := c.NewMonitor(ctx, c.Range(1, 7)) + m.Go(func(ctx context.Context) error { + c.Run(ctx, c.Node(8), `./cockroach workload run kv --read-percent 50 `+ + `--duration 20m --concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ + `--histograms=`+t.PerfArtifactsDir()+`/stats.json `+ + `{pgurl:6-7}`) + return nil + }) + + // Start a worker to fail and recover partial partitions between n4,n5 + // (leases) and n6,n7 (gateways), both fully and individually, for 3 cycles. + // Leases are only placed on n4. + failer.Ready(ctx, m) + m.Go(func(ctx context.Context) error { + var raftCfg base.RaftConfig + raftCfg.SetDefaults() + + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + + for i := 0; i < 3; i++ { + testcases := []struct { + nodes []int + peers []int + }{ + // Fully partition leases from gateways, must route via n1-n3. 
In + // addition to n4 leaseholder being unreachable, follower on n5 is + // unreachable, and follower replica on n6 can't reach leaseholder. + {[]int{6, 7}, []int{4, 5}}, + // Partition n6 (gateway with local follower) from n4 (leaseholder). + // Local follower replica can't reach leaseholder. + {[]int{6}, []int{4}}, + // Partition n7 (gateway) from n4 (leaseholder). + {[]int{7}, []int{4}}, + } + for _, tc := range testcases { + select { + case <-ticker.C: + case <-ctx.Done(): + return ctx.Err() + } + + randTimer := time.After(randutil.RandDuration(rng, raftCfg.RangeLeaseRenewalDuration())) + + // Ranges and leases may occasionally escape their constraints. Move + // them to where they should be. + relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 7}, []int{2, 3, 4, 5, 6}) + relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{4, 5, 6, 7}, []int{1, 2, 3}) + relocateLeases(t, ctx, conn, `database_name = 'kv'`, 4) + + // Randomly sleep up to the lease renewal interval, to vary the time + // between the last lease renewal and the failure. We start the timer + // before the range relocation above to run them concurrently. + select { + case <-randTimer: + case <-ctx.Done(): + } + + for _, node := range tc.nodes { + t.Status(fmt.Sprintf("failing n%d (blackhole lease/gateway)", node)) + failer.FailPartial(ctx, node, tc.peers) + } + + select { + case <-ticker.C: + case <-ctx.Done(): + return ctx.Err() + } + + for _, node := range tc.nodes { + t.Status(fmt.Sprintf("recovering n%d (blackhole lease/gateway)", node)) + failer.Recover(ctx, node) + } + } + } + return nil + }) + m.Wait() +} + // runFailoverPartialLeaseLiveness tests a partial network partition between a // leaseholder and node liveness. With epoch leases we would normally expect // this to recover shortly, since the node can't heartbeat its liveness record @@ -947,6 +1120,32 @@ func (f *pauseFailer) Recover(ctx context.Context, nodeID int) { f.c.Signal(ctx, f.t.L(), 18, f.c.Node(nodeID)) // SIGCONT } +// waitForUpreplication waits for upreplication of ranges that satisfy the +// given predicate (using SHOW RANGES). +// +// TODO(erikgrinaker): move this into WaitForReplication() when it can use SHOW +// RANGES, i.e. when it's no longer needed in mixed-version tests with older +// versions that don't have SHOW RANGES. +func waitForUpreplication( + t test.Test, ctx context.Context, conn *gosql.DB, predicate string, replicationFactor int, +) { + var count int + where := fmt.Sprintf("WHERE array_length(replicas, 1) < %d", replicationFactor) + if predicate != "" { + where += fmt.Sprintf(" AND (%s)", predicate) + } + for { + require.NoError(t, conn.QueryRowContext(ctx, + `SELECT count(distinct range_id) FROM [SHOW CLUSTER RANGES WITH TABLES, DETAILS] `+where). + Scan(&count)) + if count == 0 { + break + } + t.Status(fmt.Sprintf("waiting for %d ranges to upreplicate (%s)", count, predicate)) + time.Sleep(time.Second) + } +} + // relocateRanges relocates all ranges matching the given predicate from a set // of nodes to a different set of nodes. Moves are attempted sequentially from // each source onto each target, and errors are retried indefinitely. From 6e4a512f26064f19b9ed5d2d1a4ec06219f4cd12 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 14 May 2023 01:05:01 +0000 Subject: [PATCH 4/5] kvserver: add `COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER` This patch adds `COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER` which will disable colocation of the Raft leader and leaseholder. This is useful for tests. 
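A minimal standalone sketch of the gate this adds, using a plain `os.Getenv` lookup in place of `envutil.EnvOrDefaultBool` and a local stand-in struct for `kvserver.StoreTestingKnobs`; only the env variable name and the two knobs it flips come from the patch.

// Sketch of the environment gate. In the patch this lives in
// StoreConfig.SetDefaults.
package main

import (
	"fmt"
	"os"
	"strconv"
)

type storeKnobs struct {
	DisableLeaderFollowsLeaseholder         bool
	AllowLeaseRequestProposalsWhenNotLeader bool
}

func applyEnvOverrides(knobs *storeKnobs) {
	v, err := strconv.ParseBool(os.Getenv("COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER"))
	if err == nil && v {
		knobs.DisableLeaderFollowsLeaseholder = true
		// Without colocation the leaseholder may not be the Raft leader, so
		// lease request proposals from non-leaders must be allowed, otherwise
		// lease requests fail.
		knobs.AllowLeaseRequestProposalsWhenNotLeader = true
	}
}

func main() {
	var knobs storeKnobs
	applyEnvOverrides(&knobs)
	fmt.Printf("%+v\n", knobs)
}
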
Epic: none Release note: None --- pkg/kv/kvserver/replica_range_lease.go | 4 ++++ pkg/kv/kvserver/store.go | 4 ++++ pkg/kv/kvserver/testing_knobs.go | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index 06bef49699bd..f954f9496511 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -857,6 +857,10 @@ func (r *Replica) requestLeaseLocked( func (r *Replica) AdminTransferLease( ctx context.Context, target roachpb.StoreID, bypassSafetyChecks bool, ) error { + if r.store.cfg.TestingKnobs.DisableLeaderFollowsLeaseholder { + // Ensure lease transfers still work when we don't colocate leaders and leases. + bypassSafetyChecks = true + } // initTransferHelper inits a transfer if no extension is in progress. // It returns a channel for waiting for the result of a pending // extension (if any is in progress) and a channel for waiting for the diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go index 3d333701dc3d..47c66f1167f9 100644 --- a/pkg/kv/kvserver/store.go +++ b/pkg/kv/kvserver/store.go @@ -1205,6 +1205,10 @@ func (sc *StoreConfig) SetDefaults(numStores int) { if sc.RaftEntryCacheSize == 0 { sc.RaftEntryCacheSize = defaultRaftEntryCacheSize } + if envutil.EnvOrDefaultBool("COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER", false) { + sc.TestingKnobs.DisableLeaderFollowsLeaseholder = true + sc.TestingKnobs.AllowLeaseRequestProposalsWhenNotLeader = true // otherwise lease requests fail + } } // GetStoreConfig exposes the config used for this store. diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go index 46e3806fe837..ec5a6a468120 100644 --- a/pkg/kv/kvserver/testing_knobs.go +++ b/pkg/kv/kvserver/testing_knobs.go @@ -214,7 +214,8 @@ type StoreTestingKnobs struct { // DisableScanner disables the replica scanner. DisableScanner bool // DisableLeaderFollowsLeaseholder disables attempts to transfer raft - // leadership when it diverges from the range's leaseholder. + // leadership when it diverges from the range's leaseholder. This can + // also be set via COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER. DisableLeaderFollowsLeaseholder bool // DisableRefreshReasonNewLeader disables refreshing pending commands when a new // leader is discovered. From 3f53f25b8e054eb21e97bd29f9807645ef783e5a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 14 May 2023 01:08:43 +0000 Subject: [PATCH 5/5] roachtest: add `failover/partial/lease-leader` This patch adds a roachtest that benchmarks unavailability during a partial partition between a Raft leader and leaseholder. 
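The failure pattern is the same partial blackhole used by the other partial-partition tests: each of n4-n6 is partitioned from the next node in the ring, using the per-peer iptables rules that `blackholeFailer.FailPartial` issues when dropping traffic in both directions. A standalone sketch follows; the `ringPartner` and `failPartialRules` helper names and the hard-coded peer IPs are illustrative, while the real test resolves peer addresses via `c.InternalIP`.

// Sketch of the failure schedule and per-peer blackhole rules.
package main

import "fmt"

// ringPartner mirrors the nextNode computation in the test: n4->n5, n5->n6,
// n6->n4.
func ringPartner(node int) int {
	next := node + 1
	if next > 6 {
		next = 4
	}
	return next
}

// failPartialRules lists the iptables rules run on the failed node to drop
// all port-26257 traffic to and from a single peer.
func failPartialRules(peerIP string) []string {
	return []string{
		// Inbound TCP connections, both received and sent packets.
		fmt.Sprintf(`sudo iptables -A INPUT -p tcp -s %s --dport 26257 -j DROP`, peerIP),
		fmt.Sprintf(`sudo iptables -A OUTPUT -p tcp -d %s --sport 26257 -j DROP`, peerIP),
		// Outbound TCP connections, both sent and received packets.
		fmt.Sprintf(`sudo iptables -A OUTPUT -p tcp -d %s --dport 26257 -j DROP`, peerIP),
		fmt.Sprintf(`sudo iptables -A INPUT -p tcp -s %s --sport 26257 -j DROP`, peerIP),
	}
}

func main() {
	for _, node := range []int{4, 5, 6} {
		peer := ringPartner(node)
		fmt.Printf("partition n%d from n%d:\n", node, peer)
		for _, rule := range failPartialRules(fmt.Sprintf("10.0.0.%d", peer)) {
			fmt.Println(" ", rule)
		}
	}
}
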
Epic: none Release note: None --- pkg/cmd/roachtest/tests/failover.go | 174 ++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index 27d518c7e565..57a06c471872 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -45,6 +45,16 @@ func registerFailover(r registry.Registry) { }, }) + r.Add(registry.TestSpec{ + Name: "failover/partial/lease-leader" + suffix, + Owner: registry.OwnerKV, + Timeout: 30 * time.Minute, + Cluster: r.MakeClusterSpec(7, spec.CPU(4)), + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + runFailoverPartialLeaseLeader(ctx, t, c, expirationLeases) + }, + }) + r.Add(registry.TestSpec{ Name: "failover/partial/lease-liveness" + suffix, Owner: registry.OwnerKV, @@ -267,6 +277,159 @@ func runFailoverPartialLeaseGateway( m.Wait() } +// runFailoverLeaseLeader tests a partial network partition between leaseholders +// and Raft leaders. These will prevent the leaseholder from making Raft +// proposals, but it can still hold onto leases as long as it can heartbeat +// liveness. +// +// Cluster topology: +// +// n1-n3: system and liveness ranges, SQL gateway +// n4-n6: user ranges +// +// The cluster runs with COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER, which +// will place Raft leaders and leases independently of each other. We can then +// assume that some number of user ranges will randomly have split leader/lease, +// and simply create partial partitions between each of n4-n6 in sequence. +// +// We run a kv50 workload on SQL gateways and collect pMax latency for graphing. +func runFailoverPartialLeaseLeader( + ctx context.Context, t test.Test, c cluster.Cluster, expLeases bool, +) { + require.Equal(t, 7, c.Spec().NodeCount) + + rng, _ := randutil.NewTestRand() + + // Create cluster, disabling leader/leaseholder colocation. We only start + // n1-n3, to precisely place system ranges, since we'll have to disable the + // replicate queue shortly. + opts := option.DefaultStartOpts() + settings := install.MakeClusterSettings() + settings.Env = append(settings.Env, "COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER=true") + + failer := makeFailer(t, c, failureModeBlackhole, opts, settings).(partialFailer) + failer.Setup(ctx) + defer failer.Cleanup(ctx) + + c.Put(ctx, t.Cockroach(), "./cockroach") + c.Start(ctx, t.L(), opts, settings, c.Range(1, 3)) + + conn := c.Conn(ctx, t.L(), 1) + defer conn.Close() + + _, err := conn.ExecContext(ctx, `SET CLUSTER SETTING kv.expiration_leases_only.enabled = $1`, + expLeases) + require.NoError(t, err) + + // Place all ranges on n1-n3 to start with, and wait for upreplication. + configureAllZones(t, ctx, conn, zoneConfig{replicas: 3, onlyNodes: []int{1, 2, 3}}) + require.NoError(t, WaitFor3XReplication(ctx, t, conn)) + + // Disable the replicate queue. It can otherwise end up with stuck + // overreplicated ranges during rebalancing, because downreplication requires + // the Raft leader to be colocated with the leaseholder. + _, err = conn.ExecContext(ctx, `SET CLUSTER SETTING kv.replicate_queue.enabled = false`) + require.NoError(t, err) + + // Now that system ranges are properly placed on n1-n3, start n4-n6. + c.Start(ctx, t.L(), opts, settings, c.Range(4, 6)) + + // Create the kv database on n4-n6. 
+ t.Status("creating workload database") + _, err = conn.ExecContext(ctx, `CREATE DATABASE kv`) + require.NoError(t, err) + configureZone(t, ctx, conn, `DATABASE kv`, zoneConfig{replicas: 3, onlyNodes: []int{4, 5, 6}}) + + c.Run(ctx, c.Node(6), `./cockroach workload init kv --splits 1000 {pgurl:1}`) + + // Move ranges to the appropriate nodes. Precreating the database/range and + // moving it to the correct nodes first is not sufficient, since workload will + // spread the ranges across all nodes regardless. + relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3}, []int{4, 5, 6}) + relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{4, 5, 6}, []int{1, 2, 3}) + + // Check that we have a few split leaders/leaseholders on n4-n6. We give + // it a few seconds, since metrics are updated every 10 seconds. + for i := 0; ; i++ { + var count float64 + for _, node := range []int{4, 5, 6} { + count += nodeMetric(ctx, t, c, node, "replicas.leaders_not_leaseholders") + } + t.Status(fmt.Sprintf("%.0f split leaders/leaseholders", count)) + if count >= 3 { + break + } else if i >= 10 { + t.Fatalf("timed out waiting for 3 split leaders/leaseholders") + } + time.Sleep(time.Second) + } + + // Start workload on n7 using n1-n3 as gateways. + t.Status("running workload") + m := c.NewMonitor(ctx, c.Range(1, 6)) + m.Go(func(ctx context.Context) error { + c.Run(ctx, c.Node(7), `./cockroach workload run kv --read-percent 50 `+ + `--duration 20m --concurrency 256 --max-rate 2048 --timeout 1m --tolerate-errors `+ + `--histograms=`+t.PerfArtifactsDir()+`/stats.json `+ + `{pgurl:1-3}`) + return nil + }) + + // Start a worker to fail and recover partial partitions between each pair of + // n4-n6 for 3 cycles (9 failures total). + failer.Ready(ctx, m) + m.Go(func(ctx context.Context) error { + var raftCfg base.RaftConfig + raftCfg.SetDefaults() + + ticker := time.NewTicker(time.Minute) + defer ticker.Stop() + + for i := 0; i < 3; i++ { + for _, node := range []int{4, 5, 6} { + select { + case <-ticker.C: + case <-ctx.Done(): + return ctx.Err() + } + + randTimer := time.After(randutil.RandDuration(rng, raftCfg.RangeLeaseRenewalDuration())) + + // Ranges may occasionally escape their constraints. Move them to where + // they should be. + relocateRanges(t, ctx, conn, `database_name = 'kv'`, []int{1, 2, 3}, []int{4, 5, 6}) + relocateRanges(t, ctx, conn, `database_name != 'kv'`, []int{4, 5, 6}, []int{1, 2, 3}) + + // Randomly sleep up to the lease renewal interval, to vary the time + // between the last lease renewal and the failure. We start the timer + // before the range relocation above to run them concurrently. + select { + case <-randTimer: + case <-ctx.Done(): + } + + t.Status(fmt.Sprintf("failing n%d (blackhole lease/leader)", node)) + nextNode := node + 1 + if nextNode > 6 { + nextNode = 4 + } + failer.FailPartial(ctx, node, []int{nextNode}) + + select { + case <-ticker.C: + case <-ctx.Done(): + return ctx.Err() + } + + t.Status(fmt.Sprintf("recovering n%d (blackhole lease/leader)", node)) + failer.Recover(ctx, node) + } + } + return nil + }) + m.Wait() +} + // runFailoverPartialLeaseLiveness tests a partial network partition between a // leaseholder and node liveness. With epoch leases we would normally expect // this to recover shortly, since the node can't heartbeat its liveness record @@ -1262,3 +1425,14 @@ func configureAllZones(t test.Test, ctx context.Context, conn *gosql.DB, cfg zon } require.NoError(t, rows.Err()) } + +// nodeMetric fetches the given metric value from the given node. 
+func nodeMetric( + ctx context.Context, t test.Test, c cluster.Cluster, node int, metric string, +) float64 { + var value float64 + err := c.Conn(ctx, t.L(), node).QueryRowContext( + ctx, `SELECT value FROM crdb_internal.node_metrics WHERE name = $1`, metric).Scan(&value) + require.NoError(t, err) + return value +}
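
For reference, the two SQL-backed helpers added in this series, `waitForUpreplication` and `nodeMetric`, can be exercised outside the roachtest harness roughly as follows. The queries are the ones from the patch; the connection string, the `lib/pq` driver choice, and the particular predicate and metric polled in `main` are assumptions for illustration.

// Standalone sketch exercising the upreplication wait and metric read.
package main

import (
	"database/sql"
	"fmt"
	"log"
	"time"

	_ "github.com/lib/pq" // assumed Postgres-wire driver
)

// nodeMetric reads a single metric value from crdb_internal.node_metrics on
// the node the connection points at.
func nodeMetric(db *sql.DB, metric string) (float64, error) {
	var value float64
	err := db.QueryRow(
		`SELECT value FROM crdb_internal.node_metrics WHERE name = $1`, metric).Scan(&value)
	return value, err
}

// waitForUpreplication polls until no range matching the predicate has fewer
// than replicationFactor replicas, mirroring the helper in the patch.
func waitForUpreplication(db *sql.DB, predicate string, replicationFactor int) error {
	where := fmt.Sprintf("WHERE array_length(replicas, 1) < %d", replicationFactor)
	if predicate != "" {
		where += fmt.Sprintf(" AND (%s)", predicate)
	}
	for {
		var count int
		err := db.QueryRow(
			`SELECT count(distinct range_id) FROM [SHOW CLUSTER RANGES WITH TABLES, DETAILS] ` + where).
			Scan(&count)
		if err != nil {
			return err
		}
		if count == 0 {
			return nil
		}
		log.Printf("waiting for %d ranges to upreplicate (%s)", count, predicate)
		time.Sleep(time.Second)
	}
}

func main() {
	// Assumed: a locally reachable cluster listening on 26257.
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	if err := waitForUpreplication(db, `database_name = 'kv'`, 5); err != nil {
		log.Fatal(err)
	}
	split, err := nodeMetric(db, "replicas.leaders_not_leaseholders")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("split leaders/leaseholders on this node: %.0f\n", split)
}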