keyring: remove root key GC (#15034)
tgross authored Oct 25, 2022
1 parent d978e77 commit b583f78
Showing 4 changed files with 35 additions and 155 deletions.
3 changes: 3 additions & 0 deletions .changelog/15034.txt
@@ -0,0 +1,3 @@
```release-note:bug
keyring: Removed root key garbage collection to avoid orphaned workload identities
```
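
Context for the note above: workload identities are tokens signed by a root encryption key, so deleting a key while an allocation whose identity it signed is still running leaves that identity unverifiable. The sketch below illustrates only the failure mode; the `keyring` type and `verifyIdentity` helper are hypothetical stand-ins, not Nomad's actual keyring API.

```go
package main

import (
	"errors"
	"fmt"
)

// keyring stands in for the server's set of root key metadata, indexed by
// key ID (illustrative only).
type keyring map[string]struct{}

// verifyIdentity fails when the key that signed an identity is no longer in
// the keyring, which is the orphaned workload identity problem the release
// note describes.
func verifyIdentity(kr keyring, signingKeyID string) error {
	if _, ok := kr[signingKeyID]; !ok {
		return errors.New("unknown signing key: identity can no longer be verified")
	}
	return nil // a real implementation would check the signature here
}

func main() {
	kr := keyring{"key-a": {}}
	fmt.Println(verifyIdentity(kr, "key-a")) // <nil>

	delete(kr, "key-a") // simulate garbage-collecting an old, inactive key
	fmt.Println(verifyIdentity(kr, "key-a")) // error: orphaned identity
}
```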
99 changes: 10 additions & 89 deletions nomad/core_sched.go
@@ -60,7 +60,7 @@ func (c *CoreScheduler) Process(eval *structs.Evaluation) error {
case structs.CoreJobGlobalTokenExpiredGC:
return c.expiredACLTokenGC(eval, true)
case structs.CoreJobRootKeyRotateOrGC:
return c.rootKeyRotateOrGC(eval)
return c.rootKeyRotate(eval)
case structs.CoreJobVariablesRekey:
return c.variablesRekey(eval)
case structs.CoreJobForceGC:
@@ -96,9 +96,7 @@ func (c *CoreScheduler) forceGC(eval *structs.Evaluation) error {
if err := c.expiredACLTokenGC(eval, true); err != nil {
return err
}
if err := c.rootKeyGC(eval); err != nil {
return err
}

// Node GC must occur after the others to ensure the allocations are
// cleared.
return c.nodeGC(eval)
@@ -895,100 +893,23 @@ func (c *CoreScheduler) expiredACLTokenGC(eval *structs.Evaluation, global bool)
return c.srv.RPC(structs.ACLDeleteTokensRPCMethod, req, &structs.GenericResponse{})
}

// rootKeyRotateOrGC is used to rotate or garbage collect root keys
func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {

// a rotation will be sent to the leader so our view of state
// is no longer valid. we ack this core job and will pick up
// the GC work on the next interval
wasRotated, err := c.rootKeyRotation(eval)
if err != nil {
return err
}
if wasRotated {
return nil
}
return c.rootKeyGC(eval)
}

func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation) error {

// we can't GC any key older than the oldest live allocation
// because it might have signed that allocation's workload
// identity; this is conservative so that we don't have to iterate
// over all the allocations and find out which keys signed their
// identity, which will be expensive on large clusters
allocOldThreshold, err := c.getOldestAllocationIndex()
if err != nil {
return err
}

oldThreshold := c.getThreshold(eval, "root key",
"root_key_gc_threshold", c.srv.config.RootKeyGCThreshold)

ws := memdb.NewWatchSet()
iter, err := c.snap.RootKeyMetas(ws)
if err != nil {
return err
}

for {
raw := iter.Next()
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
if keyMeta.Active() {
continue // never GC the active key
}
if keyMeta.CreateIndex > oldThreshold {
continue // don't GC recent keys
}
if keyMeta.CreateIndex > allocOldThreshold {
continue // don't GC keys possibly used to sign live allocations
}
varIter, err := c.snap.GetVariablesByKeyID(ws, keyMeta.KeyID)
if err != nil {
return err
}
if varIter.Next() != nil {
continue // key is still in use
}

req := &structs.KeyringDeleteRootKeyRequest{
KeyID: keyMeta.KeyID,
WriteRequest: structs.WriteRequest{
Region: c.srv.config.Region,
AuthToken: eval.LeaderACL,
},
}
if err := c.srv.RPC("Keyring.Delete",
req, &structs.KeyringDeleteRootKeyResponse{}); err != nil {
c.logger.Error("root key delete failed", "error", err)
return err
}
}

return nil
}

// rootKeyRotation checks if the active key is old enough that we need
// to kick off a rotation. Returns true if the key was rotated.
func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error) {
// rootKeyRotate checks if the active key is old enough that we need
// to kick off a rotation.
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation) error {

rotationThreshold := c.getThreshold(eval, "root key",
"root_key_rotation_threshold", c.srv.config.RootKeyRotationThreshold)

ws := memdb.NewWatchSet()
activeKey, err := c.snap.GetActiveRootKeyMeta(ws)
if err != nil {
return false, err
return err
}
if activeKey == nil {
return false, nil // no active key
return nil // no active key
}
if activeKey.CreateIndex >= rotationThreshold {
return false, nil // key is too new
return nil // key is too new
}

req := &structs.KeyringRotateRootKeyRequest{
@@ -1000,10 +921,10 @@ func (c *CoreScheduler) rootKeyRotation(eval *structs.Evaluation) (bool, error)
if err := c.srv.RPC("Keyring.Rotate",
req, &structs.KeyringRotateRootKeyResponse{}); err != nil {
c.logger.Error("root key rotation failed", "error", err)
return false, err
return err
}

return true, nil
return nil
}

// variablesReKey is optionally run after rotating the active
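With garbage collection removed, the only decision left in this core job is whether the active key is old enough to rotate. The threshold is expressed as a Raft index: the server's time table maps wall-clock times to indexes, and the key rotates when its CreateIndex falls below the index witnessed before now minus `root_key_rotation_threshold` (this is what the test below drives with `tt.Witness`). The following is a minimal sketch of that index-cutoff idea with simplified, hypothetical types; it is not the actual `getThreshold` implementation.

```go
package main

import (
	"fmt"
	"time"
)

// witness pairs a Raft index with the time it was observed, loosely
// mimicking a time table entry (simplified, hypothetical type).
type witness struct {
	index uint64
	when  time.Time
}

// thresholdIndex returns the highest index witnessed at or before
// now-threshold; a key created at or after this index is "too new" to rotate.
func thresholdIndex(tt []witness, now time.Time, threshold time.Duration) uint64 {
	cutoff := now.Add(-threshold)
	var idx uint64
	for _, w := range tt {
		if !w.when.After(cutoff) && w.index > idx {
			idx = w.index
		}
	}
	return idx
}

func main() {
	now := time.Now()
	tt := []witness{
		{index: 1000, when: now.Add(-48 * time.Hour)},
		{index: 3000, when: now.Add(-1 * time.Hour)},
	}
	cutoff := thresholdIndex(tt, now, 24*time.Hour) // 1000
	activeKeyCreateIndex := uint64(500)
	fmt.Println("rotate:", activeKeyCreateIndex < cutoff) // rotate: true
}
```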
74 changes: 17 additions & 57 deletions nomad/core_sched_test.go
@@ -2471,73 +2471,33 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
require.NotNil(t, key0, "expected keyring to be bootstapped")
require.NoError(t, err)

// insert an "old" and inactive key
key1 := structs.NewRootKeyMeta()
key1.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(500, key1, false))

// insert an "old" and inactive key with a variable that's using it
key2 := structs.NewRootKeyMeta()
key2.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(600, key2, false))

variable := mock.VariableEncrypted()
variable.KeyID = key2.KeyID

setResp := store.VarSet(601, &structs.VarApplyStateRequest{
Op: structs.VarOpSet,
Var: variable,
})
require.NoError(t, setResp.Error)

// insert an allocation
alloc := mock.Alloc()
alloc.ClientStatus = structs.AllocClientStatusRunning
require.NoError(t, store.UpsertAllocs(
structs.MsgTypeTestSetup, 700, []*structs.Allocation{alloc}))

// insert an "old" key that's newer than oldest alloc
key3 := structs.NewRootKeyMeta()
key3.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(750, key3, false))

// insert a time table index before the last key
tt := srv.fsm.TimeTable()
tt.Witness(1000, time.Now().UTC().Add(-1*srv.config.RootKeyGCThreshold))

// insert a "new" but inactive key
key4 := structs.NewRootKeyMeta()
key4.SetInactive()
require.NoError(t, store.UpsertRootKeyMeta(1500, key4, false))

// run the core job
snap, err := store.Snapshot()
require.NoError(t, err)
core := NewCoreScheduler(srv, snap)
eval := srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 2000)
c := core.(*CoreScheduler)
require.NoError(t, c.rootKeyRotateOrGC(eval))
require.NoError(t, c.rootKeyRotate(eval))

ws := memdb.NewWatchSet()
key, err := store.RootKeyMetaByID(ws, key0.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "active key should not have been GCd")
got, err := store.GetActiveRootKeyMeta(nil)
require.NotNil(t, got, "expected keyring to have an active key")
require.Equal(t, got.KeyID, key0.KeyID)

key, err = store.RootKeyMetaByID(ws, key1.KeyID)
require.NoError(t, err)
require.Nil(t, key, "old key should have been GCd")

key, err = store.RootKeyMetaByID(ws, key2.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "old key should not have been GCd if still in use")

key, err = store.RootKeyMetaByID(ws, key3.KeyID)
require.NoError(t, err)
require.NotNil(t, key, "old key newer than oldest alloc should not have been GCd")
// insert a time table index after the key
tt := srv.fsm.TimeTable()
tt.Witness(3000, time.Now().UTC().Add(-1*srv.config.RootKeyRotationThreshold))

key, err = store.RootKeyMetaByID(ws, key4.KeyID)
// re-run the core job
snap, err = store.Snapshot()
require.NoError(t, err)
require.NotNil(t, key, "new key should not have been GCd")
core = NewCoreScheduler(srv, snap)
eval = srv.coreJobEval(structs.CoreJobRootKeyRotateOrGC, 4000)
c = core.(*CoreScheduler)
require.NoError(t, c.rootKeyRotate(eval))

got, err = store.GetActiveRootKeyMeta(nil)
require.NotNil(t, got, "expected keyring to have an active key")
require.NotEqual(t, got.KeyID, key0.KeyID)
}

// TestCoreScheduler_VariablesRekey exercises variables rekeying
14 changes: 5 additions & 9 deletions website/content/docs/configuration/server.mdx
@@ -200,15 +200,11 @@ server {
rejoin the cluster.

- `root_key_gc_interval` `(string: "10m")` - Specifies the interval between
[encryption key][] metadata garbage collections.

- `root_key_gc_threshold` `(string: "1h")` - Specifies the minimum time that an
[encryption key][] must exist before it can be eligible for garbage
collection.
checks to rotate the root [encryption key][].

- `root_key_rotation_threshold` `(string: "720h")` - Specifies the minimum time
that an [encryption key][] must exist before it is automatically rotated on
the next garbage collection interval.
the next `root_key_gc_interval`.

- `server_join` <code>([server_join][server-join]: nil)</code> - Specifies
how the Nomad server will connect to other Nomad servers. The `retry_join`
@@ -392,7 +388,7 @@ regardless of cluster size.
The base formula for determining how often a Client must heartbeat is:

```
<number of Clients> / <max_heartbeats_per_second>
```
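
For example, 5,000 Clients with the default `max_heartbeats_per_second` of 50.0 would give a base TTL of 5000 / 50 = 100 seconds; the factors below then adjust that value.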

Other factors modify this base TTL:
@@ -408,7 +404,7 @@ Other factors modify this base TTL:
to successfully heartbeat. This gives Clients time to discover a functioning
Server in case they were directly connected to a leader that crashed.

For example, given the default values for heartbeat parameters, different sized
clusters will use the following TTLs for the heartbeats. Note that the `Server TTL`
simply adds the `heartbeat_grace` parameter to the TTL Clients are given.

@@ -425,7 +421,7 @@ Regardless of size, all clients will have a Server TTL of
than the maximum Client TTL for your cluster size in order to prevent marking
live Clients as `down`.

For clusters over 5000 Clients you should increase `failover_heartbeat_ttl`
using the following formula:

```