Skip to content

Commit

Permalink
Add a periodic test of the autoseal to detect loss of connectivity. (#…
Browse files Browse the repository at this point in the history
…13078)

* Add a periodic test of the autoseal to detect loss of connectivity

* Keep the logic adjacent to autoseal

* imports

* typo, plus unnecessary constant time compare

* changelog

* pr feedback

* More feedback

* Add locking and a unit test

* unnecessary

* Add timeouts to encrypt/decrypt operations, capture activeContext before starting loop

* Add a block scope for the timeout

* copy/paste ftl

* Refactor to use two timeouts, and cleanup the repetitive failure code

* Readd 0ing gauge

* use millis

* Invert the unit test logic
  • Loading branch information
sgmiller authored Nov 10, 2021
1 parent b325d7b commit 87c2b1a
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 1 deletion.
3 changes: 3 additions & 0 deletions changelog/13078.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
core: Periodically test the health of connectivity to auto-seal backends
```
8 changes: 8 additions & 0 deletions vault/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -2143,6 +2143,10 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc,
if err := seal.UpgradeKeys(c.activeContext); err != nil {
c.logger.Warn("post-unseal upgrade seal keys failed", "error", err)
}

// Start a periodic but infrequent heartbeat to detect auto-seal backend outages at runtime rather than being
// surprised by this at the next need to unseal.
seal.StartHealthCheck()
}

c.metricsCh = make(chan struct{})
Expand Down Expand Up @@ -2224,6 +2228,10 @@ func (c *Core) preSeal() error {
c.autoRotateCancel = nil
}

if seal, ok := c.seal.(*autoSeal); ok {
seal.StopHealthCheck()
}

preSealPhysical(c)

c.logger.Info("pre-seal teardown complete")
Expand Down
95 changes: 94 additions & 1 deletion vault/seal_autoseal.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
package vault

import (
"bytes"
"context"
"crypto/subtle"
"encoding/json"
"fmt"
mathrand "math/rand"
"sync"
"sync/atomic"
"time"

proto "github.com/golang/protobuf/proto"
log "github.com/hashicorp/go-hclog"
Expand All @@ -16,7 +20,14 @@ import (

// barrierTypeUpgradeCheck checks for backwards compat on barrier type, not
// applicable in the OSS side
var barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
var (
barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
autoSealUnavailableDuration = []string{"seal", "unreachable", "time"}
// vars for unit testings
sealHealthTestIntervalNominal = 10 * time.Minute
sealHealthTestIntervalUnhealthy = 1 * time.Minute
sealHealthTestTimeout = 1 * time.Minute
)

// autoSeal is a Seal implementation that contains logic for encrypting and
// decrypting stored keys via an underlying AutoSealAccess implementation, as
Expand All @@ -28,6 +39,9 @@ type autoSeal struct {
recoveryConfig atomic.Value
core *Core
logger log.Logger

hcLock sync.Mutex
healthCheckStop chan struct{}
}

// Ensure we are implementing the Seal interface
Expand Down Expand Up @@ -499,3 +513,82 @@ func (d *autoSeal) migrateRecoveryConfig(ctx context.Context) error {

return nil
}

// StartHealthCheck starts a goroutine that tests the health of the auto-unseal backend once every 10 minutes.
// If unhealthy, logs a warning on the condition and begins testing every one minute until healthy again.
func (d *autoSeal) StartHealthCheck() {
d.StopHealthCheck()
d.hcLock.Lock()
defer d.hcLock.Unlock()

healthCheck := time.NewTicker(sealHealthTestIntervalNominal)
d.healthCheckStop = make(chan struct{})
healthCheckStop := d.healthCheckStop

go func() {
ctx := d.core.activeContext
lastTestOk := true
lastSeenOk := time.Now()

fail := func(msg string, args ...interface{}) {
d.logger.Warn(msg, args...)
if lastTestOk {
healthCheck.Reset(sealHealthTestIntervalUnhealthy)
}
lastTestOk = false
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, float32(time.Since(lastSeenOk).Milliseconds()))
}
for {
select {
case <-healthCheckStop:
if healthCheck != nil {
healthCheck.Stop()
}
healthCheckStop = nil
return
case t := <-healthCheck.C:
func() {
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
defer cancel()

testVal := fmt.Sprintf("Heartbeat %d", mathrand.Intn(1000))
ciphertext, err := d.Access.Encrypt(ctx, []byte(testVal), nil)

if err != nil {
fail("failed to encrypt seal health test value, seal backend may be unreachable", "error", err)
} else {
func() {
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
defer cancel()
plaintext, err := d.Access.Decrypt(ctx, ciphertext, nil)
if err != nil {
fail("failed to decrypt seal health test value, seal backend may be unreachable", "error", err)
}
if !bytes.Equal([]byte(testVal), plaintext) {
fail("seal health test value failed to decrypt to expected value")
} else {
d.logger.Debug("seal health test passed")
if !lastTestOk {
d.logger.Info("seal backend is now healthy again", "downtime", t.Sub(lastSeenOk).String())
healthCheck.Reset(sealHealthTestIntervalNominal)
}
lastTestOk = true
lastSeenOk = t
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, 0)
}
}()
}
}()
}
}
}()
}

func (d *autoSeal) StopHealthCheck() {
d.hcLock.Lock()
defer d.hcLock.Unlock()
if d.healthCheckStop != nil {
close(d.healthCheckStop)
d.healthCheckStop = nil
}
}
70 changes: 70 additions & 0 deletions vault/seal_autoseal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@ package vault
import (
"bytes"
"context"
"errors"
"github.com/armon/go-metrics"
"github.com/hashicorp/vault/helper/metricsutil"
"reflect"
"strings"
"testing"
"time"

proto "github.com/golang/protobuf/proto"
wrapping "github.com/hashicorp/go-kms-wrapping"
Expand Down Expand Up @@ -157,3 +162,68 @@ func TestAutoSeal_UpgradeKeys(t *testing.T) {
}
check()
}

func TestAutoSeal_HealthCheck(t *testing.T) {
inmemSink := metrics.NewInmemSink(
1000000*time.Hour,
2000000*time.Hour)

metricsConf := metrics.DefaultConfig("")
metricsConf.EnableHostname = false
metricsConf.EnableHostnameLabel = false
metricsConf.EnableServiceLabel = false
metricsConf.EnableTypePrefix = false

metrics.NewGlobal(metricsConf, inmemSink)

core, _, _ := TestCoreUnsealed(t)
testSeal, testErr := seal.NewToggleableTestSeal(nil)

var encKeys []string
changeKey := func(key string) {
encKeys = append(encKeys, key)
testSeal.Wrapper.(*seal.ToggleableWrapper).Wrapper.(*wrapping.TestWrapper).SetKeyID(key)
}

// Set initial encryption key.
changeKey("kaz")

autoSeal := NewAutoSeal(testSeal)
autoSeal.SetCore(core)
pBackend := newTestBackend(t)
core.physical = pBackend
core.metricSink = metricsutil.NewClusterMetricSink("", inmemSink)

sealHealthTestIntervalNominal = 10 * time.Millisecond
sealHealthTestIntervalUnhealthy = 10 * time.Millisecond
autoSeal.StartHealthCheck()
*testErr = errors.New("disconnected")

time.Sleep(50 * time.Millisecond)

asu := strings.Join(autoSealUnavailableDuration, ".") + ";cluster="
intervals := inmemSink.Data()
if len(intervals) == 1 {
interval := inmemSink.Data()[0]

if _, ok := interval.Gauges[asu]; !ok {
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
}
if interval.Gauges[asu].Value == 0 {
t.Fatalf("Expected value metric %s to be non-zero", asu)
}
}
*testErr = nil
time.Sleep(50 * time.Millisecond)
intervals = inmemSink.Data()
if len(intervals) == 1 {
interval := inmemSink.Data()[0]

if _, ok := interval.Gauges[asu]; !ok {
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
}
if interval.Gauges[asu].Value != 0 {
t.Fatalf("Expected value metric %s to be non-zero", asu)
}
}
}

0 comments on commit 87c2b1a

Please sign in to comment.