Skip to content

Commit

Permalink
gossip: provide online method to clear leaked gossip infos
Browse files Browse the repository at this point in the history
Fixes cockroachdb#85013.
Needed for cockroachlabs/support#1709.

This commit introduces a new `crdb_internal.unsafe_clear_gossip_info` builtin
function which allows admin users to manually clear info objects from the
cluster's gossip network. The function does so by re-gossiping an identical
value for the specified key but with a TTL that is long enough to reasonably
ensure full propagation to all nodes in the cluster but short enough to expire
quickly once propagated.

The function is best-effort. It is possible for the info object with the low
TTL to fail to reach full propagation before reaching its TTL. For instance,
this is possible during a transient network partition. The effect of this is
that the existing gossip info object with a higher (or no) TTL would remain
in the gossip network on some nodes and may eventually propagate back out to
other nodes once the partition heals.

Release note: None
  • Loading branch information
nvanbenschoten committed Aug 8, 2022
1 parent b89d46f commit 13989e5
Show file tree
Hide file tree
Showing 10 changed files with 125 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/generated/sql/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -3036,6 +3036,8 @@ table. Returns an error if validation fails.</p>
</span></td><td>Volatile</td></tr>
<tr><td><a name="crdb_internal.trace_id"></a><code>crdb_internal.trace_id() &rarr; <a href="int.html">int</a></code></td><td><span class="funcdesc"><p>Returns the current trace ID or an error if no trace is open.</p>
</span></td><td>Volatile</td></tr>
<tr><td><a name="crdb_internal.unsafe_clear_gossip_info"></a><code>crdb_internal.unsafe_clear_gossip_info(key: <a href="string.html">string</a>) &rarr; <a href="bool.html">bool</a></code></td><td><span class="funcdesc"><p>This function is used only by CockroachDB’s developers for testing purposes.</p>
</span></td><td>Volatile</td></tr>
<tr><td><a name="crdb_internal.validate_session_revival_token"></a><code>crdb_internal.validate_session_revival_token(token: <a href="bytes.html">bytes</a>) &rarr; <a href="bool.html">bool</a></code></td><td><span class="funcdesc"><p>Validate a token that was created by create_session_revival_token. Intended for testing.</p>
</span></td><td>Volatile</td></tr>
<tr><td><a name="crdb_internal.validate_ttl_scheduled_jobs"></a><code>crdb_internal.validate_ttl_scheduled_jobs() &rarr; void</code></td><td><span class="funcdesc"><p>Validate all TTL tables have a valid scheduled job attached.</p>
Expand Down
37 changes: 37 additions & 0 deletions pkg/gossip/gossip.go
Original file line number Diff line number Diff line change
Expand Up @@ -1066,6 +1066,43 @@ func (g *Gossip) GetInfoProto(key string, msg protoutil.Message) error {
return protoutil.Unmarshal(bytes, msg)
}

// TryClearInfo attempts to clear an info object from the cluster's gossip
// network. It does so by retrieving the object with the corresponding key. If
// one does not exist, there's nothing to do and the method returns false.
// Otherwise, the method re-gossips the same key-value pair with a TTL that is
// long enough to reasonably ensure full propagation to all nodes in the cluster
// but short enough to expire quickly once propagated.
//
// The method is best-effort. It is possible for the info object with the low
// TTL to fail to reach full propagation before reaching its TTL. For instance,
// this is possible during a transient network partition. The effect of this is
// that the existing gossip info object with a higher (or no) TTL would remain
// in the gossip network on some nodes and may eventually propagate back out to
// other nodes once the partition heals.
func (g *Gossip) TryClearInfo(key string) (bool, error) {
// Long enough to propagate to all nodes, short enough to expire quickly.
const ttl = 1 * time.Minute
return g.tryClearInfoWithTTL(key, ttl)
}

func (g *Gossip) tryClearInfoWithTTL(key string, ttl time.Duration) (bool, error) {
val, err := g.GetInfo(key)
if err != nil {
if errors.HasType(err, KeyNotPresentError{}) {
// Info object not known on this node. We can't force a deletion
// preemptively, e.g. with a poison entry, because we do not have a valid
// value object to populate and consumers may make assumptions about the
// format of the value.
return false, nil
}
return false, err
}
if err := g.AddInfo(key, val, ttl); err != nil {
return false, err
}
return true, nil
}

// InfoOriginatedHere returns true iff the latest info for the provided key
// originated on this node. This is useful for ensuring that the system config
// is regossiped as soon as possible when its lease changes hands.
Expand Down
20 changes: 20 additions & 0 deletions pkg/gossip/gossip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -848,6 +848,26 @@ func TestGossipPropagation(t *testing.T) {
}
return nil
})

mustClear := func(g *Gossip, key string) {
if _, err := g.tryClearInfoWithTTL(key, 3*time.Second); err != nil {
t.Fatal(err)
}
}

// Clear both entries. Verify that both are removed from the gossip network.
mustClear(local, "local")
mustClear(remote, "remote")
testutils.SucceedsSoon(t, func() error {
for gName, g := range map[string]*Gossip{"local": local, "remote": remote} {
for _, key := range []string{"local", "remote"} {
if getInfo(g, key) != nil {
return fmt.Errorf("%s info %q not cleared", gName, key)
}
}
}
return nil
})
}

// Test whether propagation of an info that was generated by a prior
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ go_library(
"explain_vec.go",
"export.go",
"filter.go",
"gossip.go",
"grant_revoke.go",
"grant_role.go",
"group.go",
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -2683,6 +2683,7 @@ func (ex *connExecutor) initEvalCtx(ctx context.Context, evalCtx *extendedEvalCo
Tenant: p,
Regions: p,
JoinTokenCreator: p,
Gossip: p,
PreparedStatementState: &ex.extraTxnState.prepStmtsNamespace,
SessionDataStack: ex.sessionDataStack,
ReCache: ex.server.reCache,
Expand Down
25 changes: 25 additions & 0 deletions pkg/sql/gossip.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright 2022 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql

import "context"

// TryClearGossipInfo implements the tree.GossipOperator interface.
func (p *planner) TryClearGossipInfo(ctx context.Context, key string) (bool, error) {
g, err := p.ExecCfg().Gossip.OptionalErr(0 /* issue */)
if err != nil {
return false, err
}
if err := p.RequireAdminRole(ctx, "try clear gossip info"); err != nil {
return false, err
}
return g.TryClearInfo(key)
}
6 changes: 6 additions & 0 deletions pkg/sql/logictest/testdata/logic_test/crdb_internal
Original file line number Diff line number Diff line change
Expand Up @@ -1134,3 +1134,9 @@ statement ok
CREATE TABLE t69684(a NAME);
INSERT INTO t69684 VALUES ('foo');
SELECT * FROM t69684 WHERE crdb_internal.increment_feature_counter(a)

# Exercise unsafe gossip builtin functions.
query B
SELECT crdb_internal.unsafe_clear_gossip_info('unknown key')
----
false
1 change: 1 addition & 0 deletions pkg/sql/planner.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ func newInternalPlanner(
p.extendedEvalCtx.Tenant = p
p.extendedEvalCtx.Regions = p
p.extendedEvalCtx.JoinTokenCreator = p
p.extendedEvalCtx.Gossip = p
p.extendedEvalCtx.ClusterID = execCfg.LogicalClusterID()
p.extendedEvalCtx.ClusterName = execCfg.RPCContext.ClusterName()
p.extendedEvalCtx.NodeID = execCfg.NodeID
Expand Down
21 changes: 21 additions & 0 deletions pkg/sql/sem/builtins/builtins.go
Original file line number Diff line number Diff line change
Expand Up @@ -4899,6 +4899,27 @@ value if you rely on the HLC for accuracy.`,
},
),

"crdb_internal.unsafe_clear_gossip_info": makeBuiltin(
tree.FunctionProperties{Category: categorySystemInfo},
tree.Overload{
Types: tree.ArgTypes{{"key", types.String}},
ReturnType: tree.FixedReturnType(types.Bool),
Fn: func(ctx *tree.EvalContext, args tree.Datums) (tree.Datum, error) {
key, ok := tree.AsDString(args[0])
if !ok {
return nil, errors.Newf("expected string value, got %T", args[0])
}
ok, err := ctx.Gossip.TryClearGossipInfo(ctx.Context, string(key))
if err != nil {
return nil, err
}
return tree.MakeDBool(tree.DBool(ok)), nil
},
Info: "This function is used only by CockroachDB's developers for testing purposes.",
Volatility: tree.VolatilityVolatile,
},
),

"crdb_internal.encode_key": makeBuiltin(
tree.FunctionProperties{Category: categorySystemInfo},
tree.Overload{
Expand Down
11 changes: 11 additions & 0 deletions pkg/sql/sem/tree/eval.go
Original file line number Diff line number Diff line change
Expand Up @@ -3532,6 +3532,15 @@ type JoinTokenCreator interface {
CreateJoinToken(ctx context.Context) (string, error)
}

// GossipOperator is capable of manipulating the cluster's gossip network. The
// methods will return errors when run by any tenant other than the system
// tenant.
type GossipOperator interface {
// TryClearGossipInfo attempts to clear an info object from the cluster's
// gossip network.
TryClearGossipInfo(ctx context.Context, key string) (bool, error)
}

// EvalContextTestingKnobs contains test knobs.
type EvalContextTestingKnobs struct {
// AssertFuncExprReturnTypes indicates whether FuncExpr evaluations
Expand Down Expand Up @@ -3690,6 +3699,8 @@ type EvalContext struct {

JoinTokenCreator JoinTokenCreator

Gossip GossipOperator

PreparedStatementState PreparedStatementState

// The transaction in which the statement is executing.
Expand Down

0 comments on commit 13989e5

Please sign in to comment.