Skip to content

Commit

Permalink
gossip: remove frequent gossiping of gossip client connections
Browse files Browse the repository at this point in the history
These gossip-clients keys make up two thirds or more of the gossip
traffic in various large clusters I've inspected, consuming almost an
entire CPU core in the worst case I've seen. They don't provide enough
value to justify that sort of ongoing cost, so this commit removes them
entirely as well as the periodic logging of the gossip network and the
crdb_internal.gossip_network table, both of which relied on them.

Release note (backward-incompatible change): We've stopped
supporting/populating the crdb_internal.gossip_network table. It was an
internal table with no API guarantees (so perhaps no meriting a release
note?).

Release note (performance improvement): Significantly reduced CPU usage
of the underlying gossip network in large clusters.
  • Loading branch information
a-robinson authored and adityamaru committed Dec 22, 2022
1 parent 7415357 commit 2f6b015
Show file tree
Hide file tree
Showing 17 changed files with 40 additions and 224 deletions.
2 changes: 0 additions & 2 deletions pkg/cli/debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -1073,8 +1073,6 @@ func parseGossipValues(gossipInfo *gossip.InfoStatus) (string, error) {
return "", errors.Wrapf(err, "failed to parse value for key %q", key)
}
output = append(output, fmt.Sprintf("%q: %+v", key, drainingInfo))
} else if strings.HasPrefix(key, gossip.KeyGossipClientsPrefix) {
output = append(output, fmt.Sprintf("%q: %v", key, string(bytes)))
}
}

Expand Down
5 changes: 0 additions & 5 deletions pkg/cli/testdata/zip/partial1
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down Expand Up @@ -179,9 +178,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[node 2] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/2/crdb_internal.gossip_liveness.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: dial tcp ...
[node 2] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/2/crdb_internal.gossip_liveness.txt.err.txt... done
[node 2] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/2/crdb_internal.gossip_network.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_network: last request failed: dial tcp ...
[node 2] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/2/crdb_internal.gossip_network.txt.err.txt... done
[node 2] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/2/crdb_internal.gossip_nodes.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: dial tcp ...
[node 2] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/2/crdb_internal.gossip_nodes.txt.err.txt... done
Expand Down Expand Up @@ -263,7 +259,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done
[node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done
[node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done
Expand Down
2 changes: 0 additions & 2 deletions pkg/cli/testdata/zip/partial1_excluded
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down Expand Up @@ -172,7 +171,6 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done
[node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done
[node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done
Expand Down
2 changes: 0 additions & 2 deletions pkg/cli/testdata/zip/partial2
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down Expand Up @@ -171,7 +170,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done
[node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done
[node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done
Expand Down
1 change: 0 additions & 1 deletion pkg/cli/testdata/zip/testzip
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down
9 changes: 0 additions & 9 deletions pkg/cli/testdata/zip/testzip_concurrent
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,6 @@ zip
[node 1] retrieving SQL data for crdb_internal.gossip_liveness...
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_network...
[node 1] retrieving SQL data for crdb_internal.gossip_network: done
[node 1] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/1/crdb_internal.gossip_network.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_nodes...
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt...
Expand Down Expand Up @@ -622,9 +619,6 @@ zip
[node 2] retrieving SQL data for crdb_internal.gossip_liveness...
[node 2] retrieving SQL data for crdb_internal.gossip_liveness: done
[node 2] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/2/crdb_internal.gossip_liveness.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_network...
[node 2] retrieving SQL data for crdb_internal.gossip_network: done
[node 2] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/2/crdb_internal.gossip_network.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_nodes...
[node 2] retrieving SQL data for crdb_internal.gossip_nodes: done
[node 2] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/2/crdb_internal.gossip_nodes.txt...
Expand Down Expand Up @@ -952,9 +946,6 @@ zip
[node 3] retrieving SQL data for crdb_internal.gossip_liveness...
[node 3] retrieving SQL data for crdb_internal.gossip_liveness: done
[node 3] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt...
[node 3] retrieving SQL data for crdb_internal.gossip_network...
[node 3] retrieving SQL data for crdb_internal.gossip_network: done
[node 3] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/3/crdb_internal.gossip_network.txt...
[node 3] retrieving SQL data for crdb_internal.gossip_nodes...
[node 3] retrieving SQL data for crdb_internal.gossip_nodes: done
[node 3] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt...
Expand Down
3 changes: 0 additions & 3 deletions pkg/cli/testdata/zip/testzip_tenant
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,6 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000)
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/1/crdb_internal.gossip_liveness.txt.err.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_network: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000)
[node 1] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/1/crdb_internal.gossip_network.txt.err.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000)
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/1/crdb_internal.gossip_nodes.txt.err.txt... done
Expand Down
1 change: 0 additions & 1 deletion pkg/cli/testdata/zip/unavailable
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null --timeout=.5s
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down
6 changes: 0 additions & 6 deletions pkg/cli/zip_table_registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -533,12 +533,6 @@ var zipInternalTablesPerNode = DebugZipTableRegistry{
"updated_at",
},
},
"crdb_internal.gossip_network": {
nonSensitiveCols: NonSensitiveColumns{
"source_id",
"target_id",
},
},
"crdb_internal.gossip_nodes": {
// `cluster_name` is hashed as we only care to see whether values are
// identical across nodes.
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/zip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ table_name NOT IN (
'cross_db_references',
'databases',
'forward_dependencies',
'gossip_network',
'index_columns',
'kv_catalog_comments',
'kv_catalog_descriptor',
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/tests/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ func runCLINodeStatus(ctx context.Context, t test.Test, c cluster.Cluster) {
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(3))
waitUntil([]string{
"is_available is_live",
"false true",
"false false",
"false false",
"false false",
})
Expand Down
64 changes: 23 additions & 41 deletions pkg/cmd/roachtest/tests/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"strconv"
"strings"
"time"
"unicode"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
Expand All @@ -47,52 +46,36 @@ func registerGossip(r registry.Registry) {
err := WaitFor3XReplication(ctx, t, c.Conn(ctx, t.L(), 1))
require.NoError(t, err)

// TODO(irfansharif): We could also look at gossip_liveness to determine
// cluster membership as seen by each gossip module, and ensure each
// node's gossip excludes the dead node and includes all other live
// ones.
gossipNetworkAccordingTo := func(node int) (network string) {
gossipNetworkAccordingTo := func(node int) (nodes []int) {
// Expiration timestamp format: <seconds>.<nanos>,<logical>. We'll
// capture just the <seconds> portion.
const query = `
SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
FROM (SELECT * FROM crdb_internal.gossip_network ORDER BY source_id, target_id)
SELECT node_id
FROM (SELECT node_id, to_timestamp(split_part(split_part(expiration, ',', 1), '.', 1)::FLOAT8) AS expiration
FROM crdb_internal.gossip_liveness)
WHERE expiration > now();
`

db := c.Conn(ctx, t.L(), node)
defer db.Close()
var s gosql.NullString
if err = db.QueryRow(query).Scan(&s); err != nil {
t.Fatal(err)
}
if s.Valid {
return s.String
}
return ""
}

nodesInNetworkAccordingTo := func(node int) (nodes []int, network string) {
split := func(c rune) bool {
return !unicode.IsNumber(c)
rows, err := db.Query(query)
if err != nil {
t.Fatal(err)
}

uniqueNodes := make(map[int]struct{})
network = gossipNetworkAccordingTo(node)
for _, idStr := range strings.FieldsFunc(network, split) {
nodeID, err := strconv.Atoi(idStr)
if err != nil {
t.Fatal(err)
}
uniqueNodes[nodeID] = struct{}{}
}
for node := range uniqueNodes {
nodes = append(nodes, node)
for rows.Next() {
var nodeID int
require.NoError(t, rows.Scan(&nodeID))
require.NotZero(t, nodeID)
nodes = append(nodes, nodeID)
}
sort.Ints(nodes)
return nodes, network
return nodes
}

gossipOK := func(start time.Time, deadNode int) bool {
var expLiveNodes []int
var expGossipNetwork string

for i := 1; i <= c.Spec().NodeCount; i++ {
if elapsed := timeutil.Since(start); elapsed >= 20*time.Second {
Expand All @@ -104,36 +87,35 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
}

t.L().Printf("%d: checking gossip\n", i)
liveNodes, gossipNetwork := nodesInNetworkAccordingTo(i)
liveNodes := gossipNetworkAccordingTo(i)
for _, id := range liveNodes {
if id == deadNode {
t.L().Printf("%d: gossip not ok (dead node %d present): %s (%.0fs)\n",
i, deadNode, gossipNetwork, timeutil.Since(start).Seconds())
t.L().Printf("%d: gossip not ok (dead node %d present) (%.0fs)\n",
i, deadNode, timeutil.Since(start).Seconds())
return false
}
}

if len(expLiveNodes) == 0 {
expLiveNodes = liveNodes
expGossipNetwork = gossipNetwork
continue
}

if len(liveNodes) != len(expLiveNodes) {
t.L().Printf("%d: gossip not ok (mismatched size of network: %s); expected %d, got %d (%.0fs)\n",
i, gossipNetwork, len(expLiveNodes), len(liveNodes), timeutil.Since(start).Seconds())
t.L().Printf("%d: gossip not ok (mismatched size of network); expected %d, got %d (%.0fs)\n",
i, len(expLiveNodes), len(liveNodes), timeutil.Since(start).Seconds())
return false
}

for i := range liveNodes {
if liveNodes[i] != expLiveNodes[i] {
t.L().Printf("%d: gossip not ok (mismatched view of live nodes); expected %s, got %s (%.0fs)\n",
i, gossipNetwork, expLiveNodes, liveNodes, timeutil.Since(start).Seconds())
i, expLiveNodes, liveNodes, timeutil.Since(start).Seconds())
return false
}
}
}
t.L().Printf("gossip ok: %s (size: %d) (%0.0fs)\n", expGossipNetwork, len(expLiveNodes), timeutil.Since(start).Seconds())
t.L().Printf("gossip ok (size: %d) (%0.0fs)\n", len(expLiveNodes), timeutil.Since(start).Seconds())
return true
}

Expand Down
1 change: 0 additions & 1 deletion pkg/gossip/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,6 @@ func (c *client) gossip(
}
if peerID == 0 && c.peerID != 0 {
peerID = c.peerID
g.updateClients()
}
}
}()
Expand Down
Loading

0 comments on commit 2f6b015

Please sign in to comment.