From a756ff57dbd471906998d6b2dad27238db2a8785 Mon Sep 17 00:00:00 2001 From: Peter Mattis Date: Sat, 1 Sep 2018 10:32:27 -0400 Subject: [PATCH] roachtest: add acceptance/gossip/restart-node-one Move the gossip-restart-first-node-needs-incoming acceptance test to a new acceptance/gossip/restart-node-one roachtest. See #29151 Fixes #29115 Release note: None --- pkg/acceptance/gossip_peerings_test.go | 307 ------------------------- pkg/acceptance/util_gossip.go | 82 ------- pkg/cmd/roachtest/acceptance.go | 2 +- pkg/cmd/roachtest/gossip.go | 110 ++++++++- 4 files changed, 105 insertions(+), 396 deletions(-) delete mode 100644 pkg/acceptance/gossip_peerings_test.go delete mode 100644 pkg/acceptance/util_gossip.go diff --git a/pkg/acceptance/gossip_peerings_test.go b/pkg/acceptance/gossip_peerings_test.go deleted file mode 100644 index 211702bdaac6..000000000000 --- a/pkg/acceptance/gossip_peerings_test.go +++ /dev/null @@ -1,307 +0,0 @@ -// Copyright 2015 The Cockroach Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the License. - -package acceptance - -import ( - "context" - "io/ioutil" - "math/rand" - "path/filepath" - "testing" - "time" - - "github.com/cockroachdb/cockroach/pkg/acceptance/cluster" - "github.com/cockroachdb/cockroach/pkg/acceptance/localcluster" - "github.com/cockroachdb/cockroach/pkg/testutils" - "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/cockroach/pkg/util/retry" - "github.com/cockroachdb/cockroach/pkg/util/timeutil" - "github.com/pkg/errors" -) - -const waitTime = 2 * time.Minute - -func TestGossipPeerings(t *testing.T) { - s := log.Scope(t) - defer s.Close(t) - - RunLocal(t, func(t *testing.T) { - runTestWithCluster(t, testGossipPeeringsInner) - }) -} - -func testGossipPeeringsInner( - ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, -) { - num := c.NumNodes() - - deadline := timeutil.Now().Add(cfg.Duration) - - for timeutil.Now().Before(deadline) { - if err := CheckGossip(ctx, c, waitTime, HasPeers(num)); err != nil { - t.Fatal(err) - } - - // Restart the first node. - log.Infof(ctx, "restarting node 0") - if err := c.Restart(ctx, 0); err != nil { - t.Fatal(err) - } - if err := CheckGossip(ctx, c, waitTime, HasPeers(num)); err != nil { - t.Fatal(err) - } - - // Restart another node (if there is one). - var pickedNode int - if num > 1 { - pickedNode = rand.Intn(num-1) + 1 - } - log.Infof(ctx, "restarting node %d", pickedNode) - if err := c.Restart(ctx, pickedNode); err != nil { - t.Fatal(err) - } - if err := CheckGossip(ctx, c, waitTime, HasPeers(num)); err != nil { - t.Fatal(err) - } - } -} - -// TestGossipRestart verifies that the gossip network can be -// re-bootstrapped after a time when all nodes were down -// simultaneously. 
-func TestGossipRestart(t *testing.T) { - s := log.Scope(t) - defer s.Close(t) - - ctx := context.Background() - cfg := ReadConfigFromFlags() - RunLocal(t, func(t *testing.T) { - c := StartCluster(ctx, t, cfg) - defer c.AssertAndStop(ctx, t) - - testGossipRestartInner(ctx, t, c, cfg) - }) -} - -func testGossipRestartInner( - ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, -) { - // This already replicates the first range (in the local setup). - // The replication of the first range is important: as long as the - // first range only exists on one node, that node can trivially - // acquire the range lease. Once the range is replicated, however, - // nodes must be able to discover each other over gossip before the - // lease can be acquired. - num := c.NumNodes() - - deadline := timeutil.Now().Add(cfg.Duration) - - for timeutil.Now().Before(deadline) { - log.Infof(ctx, "waiting for initial gossip connections") - if err := CheckGossip(ctx, c, waitTime, HasPeers(num)); err != nil { - t.Fatal(err) - } - if err := CheckGossip(ctx, c, waitTime, hasClusterID); err != nil { - t.Fatal(err) - } - if err := CheckGossip(ctx, c, waitTime, hasSentinel); err != nil { - t.Fatal(err) - } - - log.Infof(ctx, "killing all nodes") - for i := 0; i < num; i++ { - if err := c.Kill(ctx, i); err != nil { - t.Fatal(err) - } - } - - log.Infof(ctx, "restarting all nodes") - var chs []<-chan error - for i := 0; i < num; i++ { - // We need to restart asynchronously because the local cluster nodes - // like to wait until they are ready to serve. - chs = append(chs, c.(*localcluster.LocalCluster).RestartAsync(ctx, i)) - } - for i, ch := range chs { - if err := <-ch; err != nil { - t.Errorf("error restarting node %d: %s", i, err) - } - } - if t.Failed() { - t.FailNow() - } - - testClusterConnectedAndFunctional(ctx, t, c) - } -} - -func testClusterConnectedAndFunctional(ctx context.Context, t *testing.T, c cluster.Cluster) { - num := c.NumNodes() - - log.Infof(ctx, "waiting for gossip to be connected") - if err := CheckGossip(ctx, c, waitTime, HasPeers(num)); err != nil { - t.Fatal(err) - } - if err := CheckGossip(ctx, c, waitTime, hasClusterID); err != nil { - t.Fatal(err) - } - if err := CheckGossip(ctx, c, waitTime, hasSentinel); err != nil { - t.Fatal(err) - } - - for i := 0; i < num; i++ { - db, err := c.NewDB(ctx, i) - if err != nil { - t.Fatal(err) - } - if i == 0 { - if _, err := db.Exec("CREATE DATABASE IF NOT EXISTS test"); err != nil { - t.Fatal(err) - } - if _, err := db.Exec("CREATE TABLE IF NOT EXISTS test.kv (k INT PRIMARY KEY, v INT)"); err != nil { - t.Fatal(err) - } - if _, err := db.Exec(`UPSERT INTO test.kv (k, v) VALUES (1, 0)`); err != nil { - t.Fatal(err) - } - } - rows, err := db.Query(`UPDATE test.kv SET v=v+1 WHERE k=1 RETURNING v`) - if err != nil { - t.Fatal(err) - } - defer rows.Close() - var count int - if rows.Next() { - if err := rows.Scan(&count); err != nil { - t.Fatal(err) - } - if count != (i + 1) { - t.Fatalf("unexpected value %d for write #%d (expected %d)", count, i, i+1) - } - } else { - t.Fatalf("no results found from update") - } - } -} - -// This test verifies that when the first node is restarted and has no valid -// join flags or persisted Gossip bootstrap records (achieved through restarting -// the other nodes and thus randomizing their ports), it will still be able to -// accept incoming Gossip connections and manages to bootstrap that way. -// -// See https://github.com/cockroachdb/cockroach/issues/18027. 
-func TestGossipRestartFirstNodeNeedsIncoming(t *testing.T) { - s := log.Scope(t) - defer s.Close(t) - - ctx := context.Background() - cfg := ReadConfigFromFlags() - RunLocal(t, func(t *testing.T) { - c := StartCluster(ctx, t, cfg) - defer c.AssertAndStop(ctx, t) - - testGossipRestartFirstNodeNeedsIncomingInner(ctx, t, c, cfg) - }) -} - -func testGossipRestartFirstNodeNeedsIncomingInner( - ctx context.Context, t *testing.T, c cluster.Cluster, cfg cluster.TestConfig, -) { - num := c.NumNodes() - - if err := c.Kill(ctx, 0); err != nil { - t.Fatal(err) - } - - // Configure the first node to repel all replicas. - - lc := c.(*localcluster.LocalCluster) - lc.Nodes[0].Cfg.ExtraArgs = append(lc.Nodes[0].Cfg.ExtraArgs, "--attrs", "empty") - - if err := c.Restart(ctx, 0); err != nil { - t.Fatal(err) - } - - const zoneCfg = "constraints: [-empty]" - zoneFile := filepath.Join(lc.Nodes[0].Cfg.DataDir, ".customzone") - - if err := ioutil.WriteFile(zoneFile, []byte(zoneCfg), 0644); err != nil { - t.Fatal(err) - } - - for _, zone := range []string{".default", ".meta", ".liveness"} { - if _, _, err := c.ExecCLI(ctx, 0, []string{"zone", "set", zone, "-f", zoneFile}); err != nil { - t.Fatal(err) - } - } - - db := makePGClient(t, c.PGUrl(ctx, 0)) - defer db.Close() - - // NB: This was flaky with `SucceedsSoon`. - if err := retry.ForDuration(2*time.Minute, func() error { - const query = "SELECT count(replicas) FROM crdb_internal.ranges WHERE array_position(replicas, 1) IS NOT NULL" - var count int - if err := db.QueryRow(query).Scan(&count); err != nil { - t.Fatal(err) - } - if count > 0 { - err := errors.Errorf("first node still has %d replicas", count) - log.Info(ctx, err) - return err - } - return nil - }); err != nil { - t.Fatal(err) - } - - log.Infof(ctx, "killing all nodes") - for i := 0; i < num; i++ { - if err := c.Kill(ctx, i); err != nil { - t.Fatal(err) - } - } - - log.Infof(ctx, "restarting only first node so that it writes its advertised address") - - // Note that at this point, the first node has no join flags and no useful - // persisted Gossip info (since all other nodes are down and they'll get new - // ports when they respawn). - chs := []<-chan error{lc.RestartAsync(ctx, 0)} - - testutils.SucceedsSoon(t, func() error { - if lc.Nodes[0].AdvertiseAddr() != "" { - return nil - } - return errors.New("no advertised addr yet") - }) - - // The other nodes will be joined to the first node, so they can - // reach out to it (and in fact they will, since that's the only - // reachable peer they have -- their persisted info is outdated, too!) - log.Infof(ctx, "restarting the other nodes") - for i := 1; i < num; i++ { - chs = append(chs, lc.RestartAsync(ctx, i)) - } - - log.Infof(ctx, "waiting for healthy cluster") - for i, ch := range chs { - if err := <-ch; err != nil { - t.Errorf("error restarting node %d: %s", i, err) - } - } - - testClusterConnectedAndFunctional(ctx, t, c) -} diff --git a/pkg/acceptance/util_gossip.go b/pkg/acceptance/util_gossip.go deleted file mode 100644 index 53c0de423b37..000000000000 --- a/pkg/acceptance/util_gossip.go +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2015 The Cockroach Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the License. - -package acceptance - -import ( - "context" - "strings" - "time" - - "github.com/cockroachdb/cockroach/pkg/acceptance/cluster" - "github.com/cockroachdb/cockroach/pkg/gossip" - "github.com/cockroachdb/cockroach/pkg/util/httputil" - "github.com/cockroachdb/cockroach/pkg/util/retry" - "github.com/pkg/errors" -) - -// CheckGossipFunc is the type of callback used in CheckGossip. -type CheckGossipFunc func(map[string]gossip.Info) error - -// CheckGossip fetches the gossip infoStore from each node and invokes the given -// function. The test passes if the function returns 0 for every node, -// retrying for up to the given duration. -func CheckGossip(ctx context.Context, c cluster.Cluster, d time.Duration, f CheckGossipFunc) error { - return errors.Wrapf(retry.ForDuration(d, func() error { - var infoStatus gossip.InfoStatus - for i := 0; i < c.NumNodes(); i++ { - if err := httputil.GetJSON(cluster.HTTPClient, c.URL(ctx, i)+"/_status/gossip/local", &infoStatus); err != nil { - return errors.Wrapf(err, "failed to get gossip status from node %d", i) - } - if err := f(infoStatus.Infos); err != nil { - return errors.Wrapf(err, "node %d", i) - } - } - - return nil - }), "condition failed to evaluate within %s", d) -} - -// HasPeers returns a CheckGossipFunc that passes when the given -// number of peers are connected via gossip. -func HasPeers(expected int) CheckGossipFunc { - return func(infos map[string]gossip.Info) error { - count := 0 - for k := range infos { - if strings.HasPrefix(k, "node:") { - count++ - } - } - if count != expected { - return errors.Errorf("expected %d peers, found %d", expected, count) - } - return nil - } -} - -// hasSentinel is a checkGossipFunc that passes when the sentinel gossip is present. -func hasSentinel(infos map[string]gossip.Info) error { - if _, ok := infos[gossip.KeySentinel]; !ok { - return errors.Errorf("sentinel not found") - } - return nil -} - -// hasClusterID is a checkGossipFunc that passes when the cluster ID gossip is present. 
-func hasClusterID(infos map[string]gossip.Info) error { - if _, ok := infos[gossip.KeyClusterID]; !ok { - return errors.Errorf("cluster ID not found") - } - return nil -} diff --git a/pkg/cmd/roachtest/acceptance.go b/pkg/cmd/roachtest/acceptance.go index aca98a5a05d8..6e7e9050ffec 100644 --- a/pkg/cmd/roachtest/acceptance.go +++ b/pkg/cmd/roachtest/acceptance.go @@ -38,7 +38,7 @@ func registerAcceptance(r *registry) { {"event-log", runEventLog}, {"gossip/peerings", runGossipPeerings}, {"gossip/restart", runGossipRestart}, - {"gossip/restart-first-node-needs-incoming", runGossipRestartFirstNodeNeedsIncoming}, + {"gossip/restart-node-one", runGossipRestartNodeOne}, {"rapid-restart", runRapidRestart}, {"status-server", runStatusServer}, } diff --git a/pkg/cmd/roachtest/gossip.go b/pkg/cmd/roachtest/gossip.go index a09027604101..e8021becb40a 100644 --- a/pkg/cmd/roachtest/gossip.go +++ b/pkg/cmd/roachtest/gossip.go @@ -19,7 +19,10 @@ import ( "context" gosql "database/sql" "fmt" + "net" "net/http" + "net/url" + "strconv" "strings" "time" "unicode" @@ -126,16 +129,18 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',') type gossipUtil struct { waitTime time.Duration urlMap map[int]string + conn func(ctx context.Context, i int) *gosql.DB } -func newGossipUtil(ctx context.Context, c *cluster, waitTime time.Duration) *gossipUtil { +func newGossipUtil(ctx context.Context, c *cluster) *gossipUtil { urlMap := make(map[int]string) for i, addr := range c.ExternalAdminUIAddr(ctx, c.All()) { urlMap[i+1] = `http://` + addr } return &gossipUtil{ - waitTime: waitTime, + waitTime: 30 * time.Second, urlMap: urlMap, + conn: c.Conn, } } @@ -207,7 +212,7 @@ func (g *gossipUtil) checkConnectedAndFunctional(ctx context.Context, t *test, c } for i := 1; i <= c.nodes; i++ { - db := c.Conn(ctx, i) + db := g.conn(ctx, i) defer db.Close() if i == 1 { if _, err := db.Exec("CREATE DATABASE IF NOT EXISTS test"); err != nil { @@ -246,7 +251,7 @@ func runGossipPeerings(ctx context.Context, t *test, c *cluster) { // Repeatedly restart a random node and verify that all of the nodes are // seeing the gossiped values. - g := newGossipUtil(ctx, c, 30*time.Second) + g := newGossipUtil(ctx, c) deadline := timeutil.Now().Add(time.Minute) for i := 1; timeutil.Now().Before(deadline); i++ { @@ -278,7 +283,7 @@ func runGossipRestart(ctx context.Context, t *test, c *cluster) { // which is required for any node to be able do even the most basic // operations on a cluster. - g := newGossipUtil(ctx, c, 30*time.Second) + g := newGossipUtil(ctx, c) deadline := timeutil.Now().Add(time.Minute) for i := 1; timeutil.Now().Before(deadline); i++ { @@ -293,5 +298,98 @@ func runGossipRestart(ctx context.Context, t *test, c *cluster) { } } -func runGossipRestartFirstNodeNeedsIncoming(ctx context.Context, t *test, c *cluster) { +func runGossipRestartNodeOne(ctx context.Context, t *test, c *cluster) { + c.Put(ctx, cockroach, "./cockroach") + c.Start(ctx, racks(c.nodes)) + + db := c.Conn(ctx, 1) + defer db.Close() + + run := func(stmt string) { + c.l.Printf("%s\n", stmt) + if _, err := db.ExecContext(ctx, stmt); err != nil { + t.Fatal(err) + } + } + + // Evacuate all of the ranges off node 1 with zone config constraints. See + // the racks setting specified when the cluster was started. 
+	run(`ALTER RANGE default EXPERIMENTAL CONFIGURE ZONE 'constraints: {"-rack=0"}'`)
+	run(`ALTER RANGE meta EXPERIMENTAL CONFIGURE ZONE 'constraints: {"-rack=0"}'`)
+	run(`ALTER RANGE liveness EXPERIMENTAL CONFIGURE ZONE 'constraints: {"-rack=0"}'`)
+
+	var lastCount int
+	if err := retry.ForDuration(2*time.Minute, func() error {
+		const query = `
+SELECT count(replicas)
+  FROM crdb_internal.ranges
+ WHERE array_position(replicas, 1) IS NOT NULL
+`
+		var count int
+		if err := db.QueryRow(query).Scan(&count); err != nil {
+			t.Fatal(err)
+		}
+		if count > 0 {
+			err := errors.Errorf("node 1 still has %d replicas", count)
+			if count != lastCount {
+				lastCount = count
+				c.l.Printf("%s\n", err)
+			}
+			return err
+		}
+		return nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+
+	c.l.Printf("killing all nodes\n")
+	c.Stop(ctx)
+
+	// Restart node 1, but have it listen on a different port for internal
+	// connections. This will require node 1 to reach out to the other nodes in
+	// the cluster for gossip info.
+	err := c.RunE(ctx, c.Node(1),
+		`./cockroach start --insecure --background --store={store-dir} `+
+			`--log-dir={log-dir} --cache=10% --max-sql-memory=10% `+
+			`--listen-addr=:$[{pgport:1}+10000] --http-port=$[{pgport:1}+1] `+
+			`> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Restart the other nodes. They won't be able to talk to node 1 until
+	// node 1 talks to them (their address info for node 1 is out of date).
+	// Node 1 needs incoming gossip info in order to determine where range 1 is.
+	c.Start(ctx, c.Range(2, c.nodes))
+
+	// We need to override DB connection creation to use the correct port for
+	// node 1. This is more complicated than it should be, a limitation of the
+	// current infrastructure, which doesn't know about cockroach nodes started
+	// on non-standard ports.
+	g := newGossipUtil(ctx, c)
+	g.conn = func(ctx context.Context, i int) *gosql.DB {
+		if i != 1 {
+			return c.Conn(ctx, i)
+		}
+		u, err := url.Parse(c.ExternalPGUrl(ctx, c.Node(1))[0])
+		if err != nil {
+			t.Fatal(err)
+		}
+		host, port, err := net.SplitHostPort(u.Host)
+		if err != nil {
+			t.Fatal(err)
+		}
+		v, err := strconv.Atoi(port)
+		if err != nil {
+			t.Fatal(err)
+		}
+		u.Host = fmt.Sprintf("%s:%d", host, v+10000)
+		db, err := gosql.Open("postgres", u.String())
+		if err != nil {
+			t.Fatal(err)
+		}
+		return db
+	}
+
+	g.checkConnectedAndFunctional(ctx, t, c)
 }
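
The connection override at the end of the new roachtest boils down to rewriting
the host port in the PostgreSQL URL so the client dials node 1 on its shifted
--listen-addr (pgport+10000). Here is a minimal, self-contained sketch of that
rewrite using only the standard library; the example URL is hypothetical and
stands in for what c.ExternalPGUrl returns.

package main

import (
	"fmt"
	"net"
	"net/url"
	"strconv"
)

// shiftPort rewrites pgURL so that its host port is increased by offset,
// mirroring the +10000 adjustment in runGossipRestartNodeOne.
func shiftPort(pgURL string, offset int) (string, error) {
	u, err := url.Parse(pgURL)
	if err != nil {
		return "", err
	}
	host, port, err := net.SplitHostPort(u.Host)
	if err != nil {
		return "", err
	}
	p, err := strconv.Atoi(port)
	if err != nil {
		return "", err
	}
	u.Host = fmt.Sprintf("%s:%d", host, p+offset)
	return u.String(), nil
}

func main() {
	// Hypothetical URL standing in for c.ExternalPGUrl(ctx, c.Node(1))[0].
	rewritten, err := shiftPort("postgres://root@10.0.0.1:26257?sslmode=disable", 10000)
	if err != nil {
		panic(err)
	}
	fmt.Println(rewritten) // postgres://root@10.0.0.1:36257?sslmode=disable
}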
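
For context on what the roachtest's gossip checks replace, the deleted
CheckGossip/HasPeers helpers polled each node's /_status/gossip/local endpoint
and required one "node:<id>" gossip entry per peer. A self-contained sketch of
that check follows; the "infos" JSON field name is assumed from the
gossip.InfoStatus type used by the deleted code, and the address and peer count
in main are hypothetical.

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strings"
	"time"
)

// infoStatus decodes just enough of the /_status/gossip/local response:
// the map of gossip infos keyed by info name.
type infoStatus struct {
	Infos map[string]json.RawMessage `json:"infos"`
}

// hasPeers returns nil when the node at baseURL sees exactly expected
// "node:" gossip entries, and a descriptive error otherwise.
func hasPeers(baseURL string, expected int) error {
	resp, err := http.Get(baseURL + "/_status/gossip/local")
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	var status infoStatus
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		return err
	}
	count := 0
	for k := range status.Infos {
		if strings.HasPrefix(k, "node:") {
			count++
		}
	}
	if count != expected {
		return fmt.Errorf("expected %d peers, found %d", expected, count)
	}
	return nil
}

func main() {
	// Hypothetical admin UI address and cluster size; the roachtest derives
	// the real addresses from c.ExternalAdminUIAddr.
	deadline := time.Now().Add(2 * time.Minute)
	for time.Now().Before(deadline) {
		if err := hasPeers("http://localhost:26258", 4); err != nil {
			fmt.Println("retrying:", err)
			time.Sleep(time.Second)
			continue
		}
		fmt.Println("gossip connected")
		return
	}
	fmt.Println("gossip never connected")
}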