diff --git a/pkg/cli/debug.go b/pkg/cli/debug.go index d10a0afdf03c..a0c69825f144 100644 --- a/pkg/cli/debug.go +++ b/pkg/cli/debug.go @@ -1073,8 +1073,6 @@ func parseGossipValues(gossipInfo *gossip.InfoStatus) (string, error) { return "", errors.Wrapf(err, "failed to parse value for key %q", key) } output = append(output, fmt.Sprintf("%q: %+v", key, drainingInfo)) - } else if strings.HasPrefix(key, gossip.KeyGossipClientsPrefix) { - output = append(output, fmt.Sprintf("%q: %v", key, string(bytes))) } } diff --git a/pkg/cli/testdata/zip/partial1 b/pkg/cli/testdata/zip/partial1 index f93018abf1c5..b18bf3648319 100644 --- a/pkg/cli/testdata/zip/partial1 +++ b/pkg/cli/testdata/zip/partial1 @@ -78,7 +78,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done @@ -179,9 +178,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null [node 2] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/2/crdb_internal.gossip_liveness.txt... [node 2] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: dial tcp ... [node 2] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/2/crdb_internal.gossip_liveness.txt.err.txt... done -[node 2] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/2/crdb_internal.gossip_network.txt... -[node 2] retrieving SQL data for crdb_internal.gossip_network: last request failed: dial tcp ... -[node 2] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/2/crdb_internal.gossip_network.txt.err.txt... done [node 2] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/2/crdb_internal.gossip_nodes.txt... [node 2] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: dial tcp ... [node 2] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/2/crdb_internal.gossip_nodes.txt.err.txt... done @@ -263,7 +259,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null [node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done -[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... 
done [node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done [node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done [node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/testdata/zip/partial1_excluded b/pkg/cli/testdata/zip/partial1_excluded index d882b90e92a2..0654e9d7d724 100644 --- a/pkg/cli/testdata/zip/partial1_excluded +++ b/pkg/cli/testdata/zip/partial1_excluded @@ -78,7 +78,6 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done @@ -172,7 +171,6 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0 [node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done -[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done [node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done [node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/testdata/zip/partial2 b/pkg/cli/testdata/zip/partial2 index 38c20451217f..421a4dd75551 100644 --- a/pkg/cli/testdata/zip/partial2 +++ b/pkg/cli/testdata/zip/partial2 @@ -78,7 +78,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... 
writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done @@ -171,7 +170,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null [node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done -[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done [node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done [node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done [node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/testdata/zip/testzip b/pkg/cli/testdata/zip/testzip index 7a8d72cb1ca5..a38ab6229695 100644 --- a/pkg/cli/testdata/zip/testzip +++ b/pkg/cli/testdata/zip/testzip @@ -81,7 +81,6 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/testdata/zip/testzip_concurrent b/pkg/cli/testdata/zip/testzip_concurrent index dbf587a74982..fb49a3865560 100644 --- a/pkg/cli/testdata/zip/testzip_concurrent +++ b/pkg/cli/testdata/zip/testzip_concurrent @@ -292,9 +292,6 @@ zip [node 1] retrieving SQL data for crdb_internal.gossip_liveness... [node 1] retrieving SQL data for crdb_internal.gossip_liveness: done [node 1] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... -[node 1] retrieving SQL data for crdb_internal.gossip_network... -[node 1] retrieving SQL data for crdb_internal.gossip_network: done -[node 1] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/1/crdb_internal.gossip_network.txt... [node 1] retrieving SQL data for crdb_internal.gossip_nodes... 
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: done [node 1] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... @@ -622,9 +619,6 @@ zip [node 2] retrieving SQL data for crdb_internal.gossip_liveness... [node 2] retrieving SQL data for crdb_internal.gossip_liveness: done [node 2] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/2/crdb_internal.gossip_liveness.txt... -[node 2] retrieving SQL data for crdb_internal.gossip_network... -[node 2] retrieving SQL data for crdb_internal.gossip_network: done -[node 2] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/2/crdb_internal.gossip_network.txt... [node 2] retrieving SQL data for crdb_internal.gossip_nodes... [node 2] retrieving SQL data for crdb_internal.gossip_nodes: done [node 2] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/2/crdb_internal.gossip_nodes.txt... @@ -952,9 +946,6 @@ zip [node 3] retrieving SQL data for crdb_internal.gossip_liveness... [node 3] retrieving SQL data for crdb_internal.gossip_liveness: done [node 3] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... -[node 3] retrieving SQL data for crdb_internal.gossip_network... -[node 3] retrieving SQL data for crdb_internal.gossip_network: done -[node 3] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/3/crdb_internal.gossip_network.txt... [node 3] retrieving SQL data for crdb_internal.gossip_nodes... [node 3] retrieving SQL data for crdb_internal.gossip_nodes: done [node 3] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... diff --git a/pkg/cli/testdata/zip/testzip_tenant b/pkg/cli/testdata/zip/testzip_tenant index ceab965fade4..7ca3c80c608d 100644 --- a/pkg/cli/testdata/zip/testzip_tenant +++ b/pkg/cli/testdata/zip/testzip_tenant @@ -110,9 +110,6 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... [node 1] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000) [node 1] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/1/crdb_internal.gossip_liveness.txt.err.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... -[node 1] retrieving SQL data for crdb_internal.gossip_network: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000) -[node 1] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/1/crdb_internal.gossip_network.txt.err.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... [node 1] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000) [node 1] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/1/crdb_internal.gossip_nodes.txt.err.txt... 
done diff --git a/pkg/cli/testdata/zip/unavailable b/pkg/cli/testdata/zip/unavailable index f48e3c425dab..a25c97f0c941 100644 --- a/pkg/cli/testdata/zip/unavailable +++ b/pkg/cli/testdata/zip/unavailable @@ -90,7 +90,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null --timeout=.5s [node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done -[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done [node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done [node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done [node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done diff --git a/pkg/cli/zip_table_registry.go b/pkg/cli/zip_table_registry.go index 631402fff76c..8bb5c171f224 100644 --- a/pkg/cli/zip_table_registry.go +++ b/pkg/cli/zip_table_registry.go @@ -533,12 +533,6 @@ var zipInternalTablesPerNode = DebugZipTableRegistry{ "updated_at", }, }, - "crdb_internal.gossip_network": { - nonSensitiveCols: NonSensitiveColumns{ - "source_id", - "target_id", - }, - }, "crdb_internal.gossip_nodes": { // `cluster_name` is hashed as we only care to see whether values are // identical across nodes. diff --git a/pkg/cli/zip_test.go b/pkg/cli/zip_test.go index eb743b1098ec..513ac0edd491 100644 --- a/pkg/cli/zip_test.go +++ b/pkg/cli/zip_test.go @@ -76,6 +76,7 @@ table_name NOT IN ( 'cross_db_references', 'databases', 'forward_dependencies', + 'gossip_network', 'index_columns', 'kv_catalog_comments', 'kv_catalog_descriptor', diff --git a/pkg/cmd/roachtest/tests/cli.go b/pkg/cmd/roachtest/tests/cli.go index a76ebdd54eb3..7bd12b7b5ba6 100644 --- a/pkg/cmd/roachtest/tests/cli.go +++ b/pkg/cmd/roachtest/tests/cli.go @@ -109,7 +109,7 @@ func runCLINodeStatus(ctx context.Context, t test.Test, c cluster.Cluster) { c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(3)) waitUntil([]string{ "is_available is_live", - "false true", + "false false", "false false", "false false", }) diff --git a/pkg/cmd/roachtest/tests/gossip.go b/pkg/cmd/roachtest/tests/gossip.go index eccf93bce211..cf8bc936a1f6 100644 --- a/pkg/cmd/roachtest/tests/gossip.go +++ b/pkg/cmd/roachtest/tests/gossip.go @@ -21,7 +21,6 @@ import ( "strconv" "strings" "time" - "unicode" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" @@ -47,52 +46,36 @@ func registerGossip(r registry.Registry) { err := WaitFor3XReplication(ctx, t, c.Conn(ctx, t.L(), 1)) require.NoError(t, err) - // TODO(irfansharif): We could also look at gossip_liveness to determine - // cluster membership as seen by each gossip module, and ensure each - // node's gossip excludes the dead node and includes all other live - // ones. - gossipNetworkAccordingTo := func(node int) (network string) { + gossipNetworkAccordingTo := func(node int) (nodes []int) { + // Expiration timestamp format: <seconds>.<nanos>,<logical>. We'll + // capture just the <seconds> portion.
const query = ` -SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',') - FROM (SELECT * FROM crdb_internal.gossip_network ORDER BY source_id, target_id) +SELECT node_id + FROM (SELECT node_id, to_timestamp(split_part(split_part(expiration, ',', 1), '.', 1)::FLOAT8) AS expiration + FROM crdb_internal.gossip_liveness) + WHERE expiration > now(); ` db := c.Conn(ctx, t.L(), node) defer db.Close() - var s gosql.NullString - if err = db.QueryRow(query).Scan(&s); err != nil { - t.Fatal(err) - } - if s.Valid { - return s.String - } - return "" - } - nodesInNetworkAccordingTo := func(node int) (nodes []int, network string) { - split := func(c rune) bool { - return !unicode.IsNumber(c) + rows, err := db.Query(query) + if err != nil { + t.Fatal(err) } - uniqueNodes := make(map[int]struct{}) - network = gossipNetworkAccordingTo(node) - for _, idStr := range strings.FieldsFunc(network, split) { - nodeID, err := strconv.Atoi(idStr) - if err != nil { - t.Fatal(err) - } - uniqueNodes[nodeID] = struct{}{} - } - for node := range uniqueNodes { - nodes = append(nodes, node) + for rows.Next() { + var nodeID int + require.NoError(t, rows.Scan(&nodeID)) + require.NotZero(t, nodeID) + nodes = append(nodes, nodeID) } sort.Ints(nodes) - return nodes, network + return nodes } gossipOK := func(start time.Time, deadNode int) bool { var expLiveNodes []int - var expGossipNetwork string for i := 1; i <= c.Spec().NodeCount; i++ { if elapsed := timeutil.Since(start); elapsed >= 20*time.Second { @@ -104,36 +87,35 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',') } t.L().Printf("%d: checking gossip\n", i) - liveNodes, gossipNetwork := nodesInNetworkAccordingTo(i) + liveNodes := gossipNetworkAccordingTo(i) for _, id := range liveNodes { if id == deadNode { - t.L().Printf("%d: gossip not ok (dead node %d present): %s (%.0fs)\n", - i, deadNode, gossipNetwork, timeutil.Since(start).Seconds()) + t.L().Printf("%d: gossip not ok (dead node %d present) (%.0fs)\n", + i, deadNode, timeutil.Since(start).Seconds()) return false } } if len(expLiveNodes) == 0 { expLiveNodes = liveNodes - expGossipNetwork = gossipNetwork continue } if len(liveNodes) != len(expLiveNodes) { - t.L().Printf("%d: gossip not ok (mismatched size of network: %s); expected %d, got %d (%.0fs)\n", - i, gossipNetwork, len(expLiveNodes), len(liveNodes), timeutil.Since(start).Seconds()) + t.L().Printf("%d: gossip not ok (mismatched size of network); expected %d, got %d (%.0fs)\n", + i, len(expLiveNodes), len(liveNodes), timeutil.Since(start).Seconds()) return false } for i := range liveNodes { if liveNodes[i] != expLiveNodes[i] { t.L().Printf("%d: gossip not ok (mismatched view of live nodes); expected %s, got %s (%.0fs)\n", - i, gossipNetwork, expLiveNodes, liveNodes, timeutil.Since(start).Seconds()) + i, expLiveNodes, liveNodes, timeutil.Since(start).Seconds()) return false } } } - t.L().Printf("gossip ok: %s (size: %d) (%0.0fs)\n", expGossipNetwork, len(expLiveNodes), timeutil.Since(start).Seconds()) + t.L().Printf("gossip ok (size: %d) (%0.0fs)\n", len(expLiveNodes), timeutil.Since(start).Seconds()) return true } diff --git a/pkg/gossip/client.go b/pkg/gossip/client.go index 6ffb4c5acf2b..a279c00a0860 100644 --- a/pkg/gossip/client.go +++ b/pkg/gossip/client.go @@ -333,7 +333,6 @@ func (c *client) gossip( } if peerID == 0 && c.peerID != 0 { peerID = c.peerID - g.updateClients() } } }() diff --git a/pkg/gossip/gossip.go b/pkg/gossip/gossip.go index 624808418c90..20bc448f3e43 100644 --- a/pkg/gossip/gossip.go +++ b/pkg/gossip/gossip.go 
@@ -47,14 +47,11 @@ the system with minimal total hops. The algorithm is as follows: package gossip import ( - "bytes" "context" "fmt" "math" "math/rand" "net" - "sort" - "strconv" "strings" "sync" "time" @@ -108,12 +105,6 @@ const ( // efficiently targeted connection to the most distant node. defaultCullInterval = 60 * time.Second - // defaultClientsInterval is the default interval for updating the gossip - // clients key which allows every node in the cluster to create a map of - // gossip connectivity. This value is intentionally small as we want to - // detect gossip partitions faster that the node liveness timeout (9s). - defaultClientsInterval = 2 * time.Second - // NodeDescriptorInterval is the interval for gossiping the node descriptor. // Note that increasing this duration may increase the likelihood of gossip // thrashing, since node descriptors are used to determine the number of gossip @@ -280,8 +271,6 @@ type Gossip struct { locality roachpb.Locality - lastConnectivity redact.RedactableString - defaultZoneConfig *zonepb.ZoneConfig } @@ -340,7 +329,6 @@ func New( // Add ourselves as a node descriptor watcher. g.mu.is.registerCallback(MakePrefixPattern(KeyNodeDescPrefix), g.updateNodeAddress) g.mu.is.registerCallback(MakePrefixPattern(KeyStoreDescPrefix), g.updateStoreMap) - // Log gossip connectivity whenever we receive an update. g.mu.Unlock() if grpcServer != nil { @@ -415,7 +403,6 @@ func (g *Gossip) SetNodeDescriptor(desc *roachpb.NodeDescriptor) error { if err := g.AddInfoProto(MakeNodeIDKey(desc.NodeID), desc, NodeDescriptorTTL); err != nil { return errors.Wrapf(err, "n%d: couldn't gossip descriptor", desc.NodeID) } - g.updateClients() return nil } @@ -596,17 +583,10 @@ func (g *Gossip) LogStatus() { } g.mu.RUnlock() - var connectivity redact.RedactableString - if s := redact.Sprint(g.Connectivity()); s != g.lastConnectivity { - g.lastConnectivity = s - connectivity = s - } - ctx := g.AnnotateCtx(context.TODO()) - log.Health.Infof(ctx, "gossip status (%s, %d node%s)\n%s%s%s", + log.Health.Infof(ctx, "gossip status (%s, %d node%s)\n%s%s", status, n, util.Pluralize(int64(n)), - g.clientStatus(), g.server.status(), - connectivity) + g.clientStatus(), g.server.status()) } func (g *Gossip) clientStatus() ClientStatus { @@ -632,63 +612,6 @@ func (g *Gossip) clientStatus() ClientStatus { return status } -// Connectivity returns the current view of the gossip network as seen by this -// node. 
-func (g *Gossip) Connectivity() Connectivity { - ctx := g.AnnotateCtx(context.TODO()) - var c Connectivity - - g.mu.RLock() - - if i := g.mu.is.getInfo(KeySentinel); i != nil { - c.SentinelNodeID = i.NodeID - } - - g.nodeDescs.Range(func(nodeID int64, _ unsafe.Pointer) bool { - key := MakeGossipClientsKey(roachpb.NodeID(nodeID)) - i := g.mu.is.getInfo(key) - if i == nil { - return true - } - - v, err := i.Value.GetBytes() - if err != nil { - log.Errorf(ctx, "unable to retrieve gossip value for %s: %v", key, err) - return true - } - if len(v) == 0 { - return true - } - - for _, part := range strings.Split(string(v), ",") { - id, err := strconv.ParseInt(part, 10 /* base */, 64 /* bitSize */) - if err != nil { - log.Errorf(ctx, "unable to parse node ID: %v", err) - } - c.ClientConns = append(c.ClientConns, Connectivity_Conn{ - SourceID: roachpb.NodeID(nodeID), - TargetID: roachpb.NodeID(id), - }) - } - return true - }) - - g.mu.RUnlock() - - sort.Slice(c.ClientConns, func(i, j int) bool { - a, b := &c.ClientConns[i], &c.ClientConns[j] - if a.SourceID < b.SourceID { - return true - } - if a.SourceID > b.SourceID { - return false - } - return a.TargetID < b.TargetID - }) - - return c -} - // EnableSimulationCycler is for TESTING PURPOSES ONLY. It sets a // condition variable which is signaled at each cycle of the // simulation via SimulationCycle(). The gossip server makes each @@ -919,31 +842,6 @@ func (g *Gossip) updateStoreMap(key string, content roachpb.Value) { g.storeDescs.Store(int64(desc.StoreID), unsafe.Pointer(&desc)) } -func (g *Gossip) updateClients() { - nodeID := g.NodeID.Get() - if nodeID == 0 { - return - } - - var buf bytes.Buffer - var sep string - - g.mu.RLock() - g.clientsMu.Lock() - for _, c := range g.clientsMu.clients { - if c.peerID != 0 { - fmt.Fprintf(&buf, "%s%d", sep, c.peerID) - sep = "," - } - } - g.clientsMu.Unlock() - g.mu.RUnlock() - - if err := g.AddInfo(MakeGossipClientsKey(nodeID), buf.Bytes(), 2*defaultClientsInterval); err != nil { - log.Errorf(g.AnnotateCtx(context.Background()), "%v", err) - } -} - // recomputeMaxPeersLocked recomputes max peers based on size of // network and set the max sizes for incoming and outgoing node sets. 
// @@ -1149,15 +1047,13 @@ func (g *Gossip) InfoOriginatedHere(key string) bool { func (g *Gossip) GetInfoStatus() InfoStatus { clientStatus := g.clientStatus() serverStatus := g.server.status() - connectivity := g.Connectivity() g.mu.RLock() defer g.mu.RUnlock() is := InfoStatus{ - Infos: make(map[string]Info), - Client: clientStatus, - Server: serverStatus, - Connectivity: connectivity, + Infos: make(map[string]Info), + Client: clientStatus, + Server: serverStatus, } for k, v := range g.mu.is.Infos { is.Infos[k] = *protoutil.Clone(v).(*Info) @@ -1407,14 +1303,11 @@ func (g *Gossip) bootstrap() { func (g *Gossip) manage() { ctx := g.AnnotateCtx(context.Background()) _ = g.server.stopper.RunAsyncTask(ctx, "gossip-manage", func(ctx context.Context) { - clientsTimer := timeutil.NewTimer() cullTimer := timeutil.NewTimer() stallTimer := timeutil.NewTimer() - defer clientsTimer.Stop() defer cullTimer.Stop() defer stallTimer.Stop() - clientsTimer.Reset(defaultClientsInterval) cullTimer.Reset(jitteredInterval(g.cullInterval)) stallTimer.Reset(jitteredInterval(g.stallInterval)) for { @@ -1425,10 +1318,6 @@ func (g *Gossip) manage() { g.doDisconnected(c) case <-g.tighten: g.tightenNetwork(ctx) - case <-clientsTimer.C: - clientsTimer.Read = true - g.updateClients() - clientsTimer.Reset(defaultClientsInterval) case <-cullTimer.C: cullTimer.Read = true cullTimer.Reset(jitteredInterval(g.cullInterval)) @@ -1512,8 +1401,6 @@ func (g *Gossip) tightenNetwork(ctx context.Context) { } func (g *Gossip) doDisconnected(c *client) { - defer g.updateClients() - g.mu.Lock() defer g.mu.Unlock() g.removeClientLocked(c) diff --git a/pkg/gossip/gossip_test.go b/pkg/gossip/gossip_test.go index 51734f5458ce..92a6ebd6ad0e 100644 --- a/pkg/gossip/gossip_test.go +++ b/pkg/gossip/gossip_test.go @@ -817,7 +817,7 @@ func TestGossipPropagation(t *testing.T) { getInfo := func(g *Gossip, key string) *Info { g.mu.RLock() defer g.mu.RUnlock() - return g.mu.is.Infos[key] + return g.mu.is.getInfo(key) } var localInfo *Info diff --git a/pkg/gossip/keys.go b/pkg/gossip/keys.go index 34a2a5a53300..6b80da82b28d 100644 --- a/pkg/gossip/keys.go +++ b/pkg/gossip/keys.go @@ -83,11 +83,6 @@ const ( // KeyDistSQLDrainingPrefix is the key prefix for each node's DistSQL // draining state. KeyDistSQLDrainingPrefix = "distsql-draining" - - // KeyGossipClientsPrefix is the prefix for keys that indicate which gossip - // client connections a node has open. This is used by other nodes in the - // cluster to build a map of the gossip network. - KeyGossipClientsPrefix = "gossip-clients" ) // MakeKey creates a canonical key under which to gossip a piece of @@ -130,11 +125,6 @@ func DecodeNodeDescKey(key string, prefix string) (roachpb.NodeID, error) { return roachpb.NodeID(nodeID), nil } -// MakeGossipClientsKey returns the gossip client key for the given node. -func MakeGossipClientsKey(nodeID roachpb.NodeID) string { - return MakeKey(KeyGossipClientsPrefix, nodeID.String()) -} - // MakeNodeHealthAlertKey returns the gossip key under which the given node can // gossip health alerts. 
func MakeNodeHealthAlertKey(nodeID roachpb.NodeID) string { diff --git a/pkg/sql/crdb_internal.go b/pkg/sql/crdb_internal.go index 54b9d02ddf67..0d1abf4a4bb4 100644 --- a/pkg/sql/crdb_internal.go +++ b/pkg/sql/crdb_internal.go @@ -57,6 +57,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/parser" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" "github.com/cockroachdb/cockroach/pkg/sql/protoreflect" "github.com/cockroachdb/cockroach/pkg/sql/roleoption" @@ -4099,9 +4100,13 @@ CREATE TABLE crdb_internal.gossip_nodes ( } alive := make(map[roachpb.NodeID]tree.DBool) + now := timeutil.Now() for _, d := range descriptors { - if _, err := g.GetInfo(gossip.MakeGossipClientsKey(d.NodeID)); err == nil { - alive[d.NodeID] = true + var gossipLiveness livenesspb.Liveness + if err := g.GetInfoProto(gossip.MakeNodeLivenessKey(d.NodeID), &gossipLiveness); err == nil { + if now.Before(gossipLiveness.Expiration.ToTimestamp().GoTime()) { + alive[d.NodeID] = true + } } } @@ -4400,26 +4405,9 @@ CREATE TABLE crdb_internal.gossip_network ( source_id INT NOT NULL, -- source node of a gossip connection target_id INT NOT NULL -- target node of a gossip connection ) - `, +`, populate: func(ctx context.Context, p *planner, _ catalog.DatabaseDescriptor, addRow func(...tree.Datum) error) error { - if err := p.RequireAdminRole(ctx, "read crdb_internal.gossip_network"); err != nil { - return err - } - - g, err := p.ExecCfg().Gossip.OptionalErr(47899) - if err != nil { - return err - } - - c := g.Connectivity() - for _, conn := range c.ClientConns { - if err := addRow( - tree.NewDInt(tree.DInt(conn.SourceID)), - tree.NewDInt(tree.DInt(conn.TargetID)), - ); err != nil { - return err - } - } + p.BufferClientNotice(ctx, pgnotice.Newf("This table is no longer supported/populated, and will be removed in a future version.")) return nil }, } diff --git a/pkg/ui/workspaces/cluster-ui/src/statementDetails/statementDetails.tsx b/pkg/ui/workspaces/cluster-ui/src/statementDetails/statementDetails.tsx index 5ca0f8bb5de6..2f1e6816483c 100644 --- a/pkg/ui/workspaces/cluster-ui/src/statementDetails/statementDetails.tsx +++ b/pkg/ui/workspaces/cluster-ui/src/statementDetails/statementDetails.tsx @@ -527,7 +527,9 @@ export class StatementDetails extends React.Component< (stats.nodes || []).map(node => node.toString()), ).sort(); const regions = unique( - (stats.nodes || []).map(node => nodeRegions[node.toString()]), + (stats.nodes || []) + .map(node => nodeRegions[node.toString()]) + .filter(r => r), // Remove undefined / unknown regions. ).sort(); const lastExec = diff --git a/pkg/ui/workspaces/cluster-ui/src/statementsTable/statementsTable.spec.tsx b/pkg/ui/workspaces/cluster-ui/src/statementsTable/statementsTable.spec.tsx new file mode 100644 index 000000000000..3538dc847188 --- /dev/null +++ b/pkg/ui/workspaces/cluster-ui/src/statementsTable/statementsTable.spec.tsx @@ -0,0 +1,60 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. 
+ +import { assert } from "chai"; +import Long from "long"; +import { + AggregateStatistics, + populateRegionNodeForStatements, +} from "./statementsTable"; + +describe("populateRegionNodeForStatements", () => { + function statementWithNodeIDs(...nodeIDs: number[]): AggregateStatistics { + return { + aggregatedFingerprintID: "", + aggregatedFingerprintHexID: "", + label: "", + summary: "", + aggregatedTs: 0, + aggregationInterval: 0, + implicitTxn: false, + fullScan: false, + database: "", + applicationName: "", + stats: { nodes: nodeIDs.map(id => Long.fromInt(id)) }, + }; + } + + it("maps nodes to regions, sorted", () => { + const statement = statementWithNodeIDs(1, 2); + populateRegionNodeForStatements([statement], { + "1": "gcp-us-west1", + "2": "gcp-us-east1", + }); + assert.deepEqual(["gcp-us-east1", "gcp-us-west1"], statement.regions); + }); + + it("handles statements without nodes", () => { + const statement = statementWithNodeIDs(); + populateRegionNodeForStatements([statement], { + "1": "gcp-us-west1", + "2": "gcp-us-east1", + }); + assert.deepEqual(statement.regions, []); + }); + + it("excludes nodes whose region we don't know", () => { + const statement = statementWithNodeIDs(1, 2); + populateRegionNodeForStatements([statement], { + "1": "gcp-us-west1", + }); + assert.deepEqual(statement.regions, ["gcp-us-west1"]); + }); +}); diff --git a/pkg/ui/workspaces/cluster-ui/src/statementsTable/statementsTable.tsx b/pkg/ui/workspaces/cluster-ui/src/statementsTable/statementsTable.tsx index c8f440c8b41d..b0b8b60472f0 100644 --- a/pkg/ui/workspaces/cluster-ui/src/statementsTable/statementsTable.tsx +++ b/pkg/ui/workspaces/cluster-ui/src/statementsTable/statementsTable.tsx @@ -341,10 +341,13 @@ export function populateRegionNodeForStatements( // E.g. {"gcp-us-east1" : [1,3,4]} if (stmt.stats.nodes) { stmt.stats.nodes.forEach(node => { - if (Object.keys(regions).includes(nodeRegions[node.toString()])) { - regions[nodeRegions[node.toString()]].add(longToInt(node)); - } else { - regions[nodeRegions[node.toString()]] = new Set([longToInt(node)]); + const region = nodeRegions[node.toString()]; + if (region) { + if (Object.keys(regions).includes(region)) { + regions[region].add(longToInt(node)); + } else { + regions[region] = new Set([longToInt(node)]); + } } }); } diff --git a/pkg/ui/workspaces/cluster-ui/src/transactionsPage/transactionsPage.tsx b/pkg/ui/workspaces/cluster-ui/src/transactionsPage/transactionsPage.tsx index 21df50dc6328..c59b79dbed4c 100644 --- a/pkg/ui/workspaces/cluster-ui/src/transactionsPage/transactionsPage.tsx +++ b/pkg/ui/workspaces/cluster-ui/src/transactionsPage/transactionsPage.tsx @@ -404,8 +404,6 @@ export class TransactionsPage extends React.Component< const statements = data?.statements || []; const { filters } = this.state; - // If the cluster is a tenant cluster we don't show info - // about nodes/regions. 
const nodes = Object.keys(nodeRegions) .map(n => Number(n)) .sort(); diff --git a/pkg/ui/workspaces/cluster-ui/src/transactionsPage/utils.spec.ts b/pkg/ui/workspaces/cluster-ui/src/transactionsPage/utils.spec.ts index 1c23ce961822..1056c9275dfd 100644 --- a/pkg/ui/workspaces/cluster-ui/src/transactionsPage/utils.spec.ts +++ b/pkg/ui/workspaces/cluster-ui/src/transactionsPage/utils.spec.ts @@ -11,6 +11,7 @@ import { assert } from "chai"; import { filterTransactions, + generateRegion, getStatementsByFingerprintId, statementFingerprintIdsToText, } from "./utils"; @@ -19,6 +20,8 @@ import { data, nodeRegions } from "./transactions.fixture"; import Long from "long"; import * as protos from "@cockroachlabs/crdb-protobuf-client"; +type Statement = + protos.cockroach.server.serverpb.StatementsResponse.ICollectedStatementStatistics; type Transaction = protos.cockroach.server.serverpb.StatementsResponse.IExtendedCollectedTransactionStatistics; @@ -288,3 +291,47 @@ SELECT _`, ); }); }); + +describe("generateRegion", () => { + function transactionWithStatementFingerprintIDs( + ...ids: number[] + ): Transaction { + return { + stats_data: { + statement_fingerprint_ids: ids.map(id => Long.fromInt(id)), + }, + }; + } + + function statementWithFingerprintAndNodeIDs( + id: number, + ...nodeIDs: number[] + ): Statement { + return { + id: Long.fromInt(id), + stats: { nodes: nodeIDs.map(id => Long.fromInt(id)) }, + }; + } + + it("gathers up the list of regions for the transaction, sorted", () => { + assert.deepEqual( + generateRegion( + transactionWithStatementFingerprintIDs(42), + [statementWithFingerprintAndNodeIDs(42, 1, 2)], + { "1": "gcp-us-west1", "2": "gcp-us-east1" }, + ), + ["gcp-us-east1", "gcp-us-west1"], + ); + }); + + it("skips over nodes with unknown regions", () => { + assert.deepEqual( + generateRegion( + transactionWithStatementFingerprintIDs(42), + [statementWithFingerprintAndNodeIDs(42, 1, 2)], + { "1": "gcp-us-west1" }, + ), + ["gcp-us-west1"], + ); + }); +}); diff --git a/pkg/ui/workspaces/cluster-ui/src/transactionsPage/utils.ts b/pkg/ui/workspaces/cluster-ui/src/transactionsPage/utils.ts index 1bd1e5363485..382e7004389c 100644 --- a/pkg/ui/workspaces/cluster-ui/src/transactionsPage/utils.ts +++ b/pkg/ui/workspaces/cluster-ui/src/transactionsPage/utils.ts @@ -273,7 +273,9 @@ export const generateRegion = ( }); }); - return Array.from(regions).sort(); + return Array.from(regions) + .filter(r => r) // Remove undefined / unknown regions. + .sort(); }; /** diff --git a/pkg/util/leaktest/leaktest.go b/pkg/util/leaktest/leaktest.go index ecbb3434b11b..879e2547725e 100644 --- a/pkg/util/leaktest/leaktest.go +++ b/pkg/util/leaktest/leaktest.go @@ -58,6 +58,10 @@ func interestingGoroutines() map[int64]string { strings.Contains(stack, "sentry-go.(*HTTPTransport).worker") || // Seems to be gccgo specific. (runtime.Compiler == "gccgo" && strings.Contains(stack, "testing.T.Parallel")) || + // Ignore intentionally long-running logging goroutines that live for the + // duration of the process. + strings.Contains(stack, "log.flushDaemon") || + strings.Contains(stack, "log.signalFlusher") || // Below are the stacks ignored by the upstream leaktest code. strings.Contains(stack, "testing.Main(") || strings.Contains(stack, "testing.tRunner(") ||
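Reviewer note (not part of the patch): with crdb_internal.gossip_network removed, the liveness-based query that the updated roachtest now runs is the general-purpose way to ask "which nodes does the cluster consider live?" over SQL. Below is a minimal standalone Go sketch of the same lookup. Only the query text is taken from this diff; the connection URL and the lib/pq driver are illustrative assumptions, not something this change introduces.

// livenodes.go: sketch of deriving live node IDs from crdb_internal.gossip_liveness.
package main

import (
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // assumed Postgres-wire driver; any compatible driver works
)

func main() {
	// Placeholder connection string for a local insecure cluster.
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Same query shape as the updated roachtest: a node counts as live while
	// its gossiped liveness expiration is still in the future.
	const query = `
SELECT node_id
  FROM (SELECT node_id, to_timestamp(split_part(split_part(expiration, ',', 1), '.', 1)::FLOAT8) AS expiration
          FROM crdb_internal.gossip_liveness)
 WHERE expiration > now();`

	rows, err := db.Query(query)
	if err != nil {
		log.Fatal(err)
	}
	defer rows.Close()

	var live []int
	for rows.Next() {
		var id int
		if err := rows.Scan(&id); err != nil {
			log.Fatal(err)
		}
		live = append(live, id)
	}
	if err := rows.Err(); err != nil {
		log.Fatal(err)
	}
	fmt.Println("live nodes:", live)
}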
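Along the same lines, crdb_internal.gossip_nodes now derives is_live by checking whether the gossiped liveness record's expiration is still ahead of the current time, instead of looking for the removed gossip-clients key. The standard-library-only sketch below shows that comparison applied to the textual expiration format (<seconds>.<nanos>,<logical>) that the roachtest query splits apart; the isLive helper and the sample timestamp are hypothetical, used only to illustrate the check.

// isLive sketch: "liveness expiration in the future" over the textual format.
package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

// isLive reports whether an expiration string such as "1646928687.000000000,0"
// (wall-clock seconds, nanos, logical component) is still in the future.
func isLive(expiration string, now time.Time) (bool, error) {
	wall := strings.SplitN(expiration, ",", 2)[0] // drop the logical component
	secs := strings.SplitN(wall, ".", 2)[0]       // keep just the seconds
	s, err := strconv.ParseInt(secs, 10, 64)
	if err != nil {
		return false, err
	}
	return now.Before(time.Unix(s, 0)), nil
}

func main() {
	// Hypothetical expiration value for demonstration purposes.
	live, err := isLive("1646928687.000000000,0", time.Now())
	if err != nil {
		panic(err)
	}
	fmt.Println("live:", live)
}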