Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release-22.2: gossip: remove frequent gossiping of gossip client connections #94072

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions pkg/cli/debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -1071,8 +1071,6 @@ func parseGossipValues(gossipInfo *gossip.InfoStatus) (string, error) {
return "", errors.Wrapf(err, "failed to parse value for key %q", key)
}
output = append(output, fmt.Sprintf("%q: %+v", key, drainingInfo))
} else if strings.HasPrefix(key, gossip.KeyGossipClientsPrefix) {
output = append(output, fmt.Sprintf("%q: %v", key, string(bytes)))
}
}

Expand Down
5 changes: 0 additions & 5 deletions pkg/cli/testdata/zip/partial1
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down Expand Up @@ -177,9 +176,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[node 2] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/2/crdb_internal.gossip_liveness.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: dial tcp ...
[node 2] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/2/crdb_internal.gossip_liveness.txt.err.txt... done
[node 2] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/2/crdb_internal.gossip_network.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_network: last request failed: dial tcp ...
[node 2] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/2/crdb_internal.gossip_network.txt.err.txt... done
[node 2] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/2/crdb_internal.gossip_nodes.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: dial tcp ...
[node 2] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/2/crdb_internal.gossip_nodes.txt.err.txt... done
Expand Down Expand Up @@ -261,7 +257,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done
[node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done
[node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done
Expand Down
2 changes: 0 additions & 2 deletions pkg/cli/testdata/zip/partial1_excluded
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down Expand Up @@ -170,7 +169,6 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done
[node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done
[node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done
Expand Down
2 changes: 0 additions & 2 deletions pkg/cli/testdata/zip/partial2
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down Expand Up @@ -169,7 +168,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[node 3] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/3/crdb_internal.feature_usage.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/3/crdb_internal.gossip_alerts.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/3/crdb_internal.gossip_network.txt... done
[node 3] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt... done
[node 3] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/3/crdb_internal.leases.txt... done
[node 3] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/3/crdb_internal.node_build_info.txt... done
Expand Down
1 change: 0 additions & 1 deletion pkg/cli/testdata/zip/testzip
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down
9 changes: 0 additions & 9 deletions pkg/cli/testdata/zip/testzip_concurrent
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,6 @@ zip
[node 1] retrieving SQL data for crdb_internal.gossip_liveness...
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_network...
[node 1] retrieving SQL data for crdb_internal.gossip_network: done
[node 1] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/1/crdb_internal.gossip_network.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_nodes...
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt...
Expand Down Expand Up @@ -614,9 +611,6 @@ zip
[node 2] retrieving SQL data for crdb_internal.gossip_liveness...
[node 2] retrieving SQL data for crdb_internal.gossip_liveness: done
[node 2] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/2/crdb_internal.gossip_liveness.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_network...
[node 2] retrieving SQL data for crdb_internal.gossip_network: done
[node 2] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/2/crdb_internal.gossip_network.txt...
[node 2] retrieving SQL data for crdb_internal.gossip_nodes...
[node 2] retrieving SQL data for crdb_internal.gossip_nodes: done
[node 2] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/2/crdb_internal.gossip_nodes.txt...
Expand Down Expand Up @@ -936,9 +930,6 @@ zip
[node 3] retrieving SQL data for crdb_internal.gossip_liveness...
[node 3] retrieving SQL data for crdb_internal.gossip_liveness: done
[node 3] retrieving SQL data for crdb_internal.gossip_liveness: writing output: debug/nodes/3/crdb_internal.gossip_liveness.txt...
[node 3] retrieving SQL data for crdb_internal.gossip_network...
[node 3] retrieving SQL data for crdb_internal.gossip_network: done
[node 3] retrieving SQL data for crdb_internal.gossip_network: writing output: debug/nodes/3/crdb_internal.gossip_network.txt...
[node 3] retrieving SQL data for crdb_internal.gossip_nodes...
[node 3] retrieving SQL data for crdb_internal.gossip_nodes: done
[node 3] retrieving SQL data for crdb_internal.gossip_nodes: writing output: debug/nodes/3/crdb_internal.gossip_nodes.txt...
Expand Down
3 changes: 0 additions & 3 deletions pkg/cli/testdata/zip/testzip_tenant
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,6 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000)
[node 1] retrieving SQL data for crdb_internal.gossip_liveness: creating error output: debug/nodes/1/crdb_internal.gossip_liveness.txt.err.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_network: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000)
[node 1] retrieving SQL data for crdb_internal.gossip_network: creating error output: debug/nodes/1/crdb_internal.gossip_network.txt.err.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt...
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: last request failed: ERROR: unimplemented: operation is unsupported in multi-tenancy mode (SQLSTATE 0A000)
[node 1] retrieving SQL data for crdb_internal.gossip_nodes: creating error output: debug/nodes/1/crdb_internal.gossip_nodes.txt.err.txt... done
Expand Down
1 change: 0 additions & 1 deletion pkg/cli/testdata/zip/unavailable
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null --timeout=.5s
[node 1] retrieving SQL data for crdb_internal.feature_usage... writing output: debug/nodes/1/crdb_internal.feature_usage.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_alerts... writing output: debug/nodes/1/crdb_internal.gossip_alerts.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_liveness... writing output: debug/nodes/1/crdb_internal.gossip_liveness.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_network... writing output: debug/nodes/1/crdb_internal.gossip_network.txt... done
[node 1] retrieving SQL data for crdb_internal.gossip_nodes... writing output: debug/nodes/1/crdb_internal.gossip_nodes.txt... done
[node 1] retrieving SQL data for crdb_internal.leases... writing output: debug/nodes/1/crdb_internal.leases.txt... done
[node 1] retrieving SQL data for crdb_internal.node_build_info... writing output: debug/nodes/1/crdb_internal.node_build_info.txt... done
Expand Down
6 changes: 0 additions & 6 deletions pkg/cli/zip_table_registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -533,12 +533,6 @@ var zipInternalTablesPerNode = DebugZipTableRegistry{
"updated_at",
},
},
"crdb_internal.gossip_network": {
nonSensitiveCols: NonSensitiveColumns{
"source_id",
"target_id",
},
},
"crdb_internal.gossip_nodes": {
// `cluster_name` is hashed as we only care to see whether values are
// identical across nodes.
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/zip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ table_name NOT IN (
'cross_db_references',
'databases',
'forward_dependencies',
'gossip_network',
'index_columns',
'lost_descriptors_with_data',
'table_columns',
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/tests/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ func runCLINodeStatus(ctx context.Context, t test.Test, c cluster.Cluster) {
c.Stop(ctx, t.L(), option.DefaultStopOpts(), c.Node(3))
waitUntil([]string{
"is_available is_live",
"false true",
"false false",
"false false",
"false false",
})
Expand Down
64 changes: 23 additions & 41 deletions pkg/cmd/roachtest/tests/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"strconv"
"strings"
"time"
"unicode"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
Expand All @@ -47,52 +46,36 @@ func registerGossip(r registry.Registry) {
err := WaitFor3XReplication(ctx, t, c.Conn(ctx, t.L(), 1))
require.NoError(t, err)

// TODO(irfansharif): We could also look at gossip_liveness to determine
// cluster membership as seen by each gossip module, and ensure each
// node's gossip excludes the dead node and includes all other live
// ones.
gossipNetworkAccordingTo := func(node int) (network string) {
gossipNetworkAccordingTo := func(node int) (nodes []int) {
// Expiration timestamp format: <seconds>.<nanos>,<logical>. We'll
// capture just the <seconds> portion.
const query = `
SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
FROM (SELECT * FROM crdb_internal.gossip_network ORDER BY source_id, target_id)
SELECT node_id
FROM (SELECT node_id, to_timestamp(split_part(split_part(expiration, ',', 1), '.', 1)::FLOAT8) AS expiration
FROM crdb_internal.gossip_liveness)
WHERE expiration > now();
`

db := c.Conn(ctx, t.L(), node)
defer db.Close()
var s gosql.NullString
if err = db.QueryRow(query).Scan(&s); err != nil {
t.Fatal(err)
}
if s.Valid {
return s.String
}
return ""
}

nodesInNetworkAccordingTo := func(node int) (nodes []int, network string) {
split := func(c rune) bool {
return !unicode.IsNumber(c)
rows, err := db.Query(query)
if err != nil {
t.Fatal(err)
}

uniqueNodes := make(map[int]struct{})
network = gossipNetworkAccordingTo(node)
for _, idStr := range strings.FieldsFunc(network, split) {
nodeID, err := strconv.Atoi(idStr)
if err != nil {
t.Fatal(err)
}
uniqueNodes[nodeID] = struct{}{}
}
for node := range uniqueNodes {
nodes = append(nodes, node)
for rows.Next() {
var nodeID int
require.NoError(t, rows.Scan(&nodeID))
require.NotZero(t, nodeID)
nodes = append(nodes, nodeID)
}
sort.Ints(nodes)
return nodes, network
return nodes
}

gossipOK := func(start time.Time, deadNode int) bool {
var expLiveNodes []int
var expGossipNetwork string

for i := 1; i <= c.Spec().NodeCount; i++ {
if elapsed := timeutil.Since(start); elapsed >= 20*time.Second {
Expand All @@ -104,36 +87,35 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',')
}

t.L().Printf("%d: checking gossip\n", i)
liveNodes, gossipNetwork := nodesInNetworkAccordingTo(i)
liveNodes := gossipNetworkAccordingTo(i)
for _, id := range liveNodes {
if id == deadNode {
t.L().Printf("%d: gossip not ok (dead node %d present): %s (%.0fs)\n",
i, deadNode, gossipNetwork, timeutil.Since(start).Seconds())
t.L().Printf("%d: gossip not ok (dead node %d present) (%.0fs)\n",
i, deadNode, timeutil.Since(start).Seconds())
return false
}
}

if len(expLiveNodes) == 0 {
expLiveNodes = liveNodes
expGossipNetwork = gossipNetwork
continue
}

if len(liveNodes) != len(expLiveNodes) {
t.L().Printf("%d: gossip not ok (mismatched size of network: %s); expected %d, got %d (%.0fs)\n",
i, gossipNetwork, len(expLiveNodes), len(liveNodes), timeutil.Since(start).Seconds())
t.L().Printf("%d: gossip not ok (mismatched size of network); expected %d, got %d (%.0fs)\n",
i, len(expLiveNodes), len(liveNodes), timeutil.Since(start).Seconds())
return false
}

for i := range liveNodes {
if liveNodes[i] != expLiveNodes[i] {
t.L().Printf("%d: gossip not ok (mismatched view of live nodes); expected %s, got %s (%.0fs)\n",
i, gossipNetwork, expLiveNodes, liveNodes, timeutil.Since(start).Seconds())
i, expLiveNodes, liveNodes, timeutil.Since(start).Seconds())
return false
}
}
}
t.L().Printf("gossip ok: %s (size: %d) (%0.0fs)\n", expGossipNetwork, len(expLiveNodes), timeutil.Since(start).Seconds())
t.L().Printf("gossip ok (size: %d) (%0.0fs)\n", len(expLiveNodes), timeutil.Since(start).Seconds())
return true
}

Expand Down
1 change: 0 additions & 1 deletion pkg/gossip/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,6 @@ func (c *client) gossip(
}
if peerID == 0 && c.peerID != 0 {
peerID = c.peerID
g.updateClients()
}
}
}()
Expand Down
Loading