Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a periodic test of the autoseal to detect loss of connectivity. #13078

Merged
merged 17 commits into from
Nov 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/13078.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
core: Periodically test the health of connectivity to auto-seal backends
```
8 changes: 8 additions & 0 deletions vault/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -2143,6 +2143,10 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc,
if err := seal.UpgradeKeys(c.activeContext); err != nil {
c.logger.Warn("post-unseal upgrade seal keys failed", "error", err)
}

// Start a periodic but infrequent heartbeat to detect auto-seal backend outages at runtime rather than being
// surprised by this at the next need to unseal.
seal.StartHealthCheck()
}

c.metricsCh = make(chan struct{})
Expand Down Expand Up @@ -2224,6 +2228,10 @@ func (c *Core) preSeal() error {
c.autoRotateCancel = nil
}

if seal, ok := c.seal.(*autoSeal); ok {
seal.StopHealthCheck()
}

preSealPhysical(c)

c.logger.Info("pre-seal teardown complete")
Expand Down
95 changes: 94 additions & 1 deletion vault/seal_autoseal.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
package vault

import (
"bytes"
"context"
"crypto/subtle"
"encoding/json"
"fmt"
mathrand "math/rand"
"sync"
"sync/atomic"
"time"

proto "github.com/golang/protobuf/proto"
log "github.com/hashicorp/go-hclog"
Expand All @@ -16,7 +20,14 @@ import (

// barrierTypeUpgradeCheck checks for backwards compat on barrier type, not
// applicable in the OSS side
var barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
var (
barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
autoSealUnavailableDuration = []string{"seal", "unreachable", "time"}
// vars for unit testings
sealHealthTestIntervalNominal = 10 * time.Minute
sealHealthTestIntervalUnhealthy = 1 * time.Minute
sealHealthTestTimeout = 1 * time.Minute
)

// autoSeal is a Seal implementation that contains logic for encrypting and
// decrypting stored keys via an underlying AutoSealAccess implementation, as
Expand All @@ -28,6 +39,9 @@ type autoSeal struct {
recoveryConfig atomic.Value
core *Core
logger log.Logger

hcLock sync.Mutex
healthCheckStop chan struct{}
}

// Ensure we are implementing the Seal interface
Expand Down Expand Up @@ -499,3 +513,82 @@ func (d *autoSeal) migrateRecoveryConfig(ctx context.Context) error {

return nil
}

// StartHealthCheck starts a goroutine that tests the health of the auto-unseal backend once every 10 minutes.
// If unhealthy, logs a warning on the condition and begins testing every one minute until healthy again.
func (d *autoSeal) StartHealthCheck() {
d.StopHealthCheck()
d.hcLock.Lock()
defer d.hcLock.Unlock()

healthCheck := time.NewTicker(sealHealthTestIntervalNominal)
d.healthCheckStop = make(chan struct{})
ncabatoff marked this conversation as resolved.
Show resolved Hide resolved
healthCheckStop := d.healthCheckStop

go func() {
ctx := d.core.activeContext
lastTestOk := true
ncabatoff marked this conversation as resolved.
Show resolved Hide resolved
lastSeenOk := time.Now()

fail := func(msg string, args ...interface{}) {
d.logger.Warn(msg, args...)
if lastTestOk {
healthCheck.Reset(sealHealthTestIntervalUnhealthy)
}
lastTestOk = false
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, float32(time.Since(lastSeenOk).Milliseconds()))
}
for {
select {
case <-healthCheckStop:
if healthCheck != nil {
healthCheck.Stop()
}
healthCheckStop = nil
return
case t := <-healthCheck.C:
func() {
ncabatoff marked this conversation as resolved.
Show resolved Hide resolved
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
defer cancel()

testVal := fmt.Sprintf("Heartbeat %d", mathrand.Intn(1000))
ciphertext, err := d.Access.Encrypt(ctx, []byte(testVal), nil)

if err != nil {
fail("failed to encrypt seal health test value, seal backend may be unreachable", "error", err)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that we're in a func we could do an early return instead of an else.

} else {
func() {
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
defer cancel()
plaintext, err := d.Access.Decrypt(ctx, ciphertext, nil)
if err != nil {
fail("failed to decrypt seal health test value, seal backend may be unreachable", "error", err)
}
if !bytes.Equal([]byte(testVal), plaintext) {
fail("seal health test value failed to decrypt to expected value")
} else {
d.logger.Debug("seal health test passed")
if !lastTestOk {
d.logger.Info("seal backend is now healthy again", "downtime", t.Sub(lastSeenOk).String())
healthCheck.Reset(sealHealthTestIntervalNominal)
}
lastTestOk = true
lastSeenOk = t
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, 0)
}
}()
}
}()
}
}
}()
}

func (d *autoSeal) StopHealthCheck() {
d.hcLock.Lock()
defer d.hcLock.Unlock()
if d.healthCheckStop != nil {
close(d.healthCheckStop)
ncabatoff marked this conversation as resolved.
Show resolved Hide resolved
d.healthCheckStop = nil
}
}
70 changes: 70 additions & 0 deletions vault/seal_autoseal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@ package vault
import (
"bytes"
"context"
"errors"
"github.com/armon/go-metrics"
"github.com/hashicorp/vault/helper/metricsutil"
"reflect"
"strings"
"testing"
"time"

proto "github.com/golang/protobuf/proto"
wrapping "github.com/hashicorp/go-kms-wrapping"
Expand Down Expand Up @@ -157,3 +162,68 @@ func TestAutoSeal_UpgradeKeys(t *testing.T) {
}
check()
}

func TestAutoSeal_HealthCheck(t *testing.T) {
inmemSink := metrics.NewInmemSink(
1000000*time.Hour,
2000000*time.Hour)

metricsConf := metrics.DefaultConfig("")
metricsConf.EnableHostname = false
metricsConf.EnableHostnameLabel = false
metricsConf.EnableServiceLabel = false
metricsConf.EnableTypePrefix = false

metrics.NewGlobal(metricsConf, inmemSink)

core, _, _ := TestCoreUnsealed(t)
testSeal, testErr := seal.NewToggleableTestSeal(nil)

var encKeys []string
changeKey := func(key string) {
encKeys = append(encKeys, key)
testSeal.Wrapper.(*seal.ToggleableWrapper).Wrapper.(*wrapping.TestWrapper).SetKeyID(key)
}

// Set initial encryption key.
changeKey("kaz")

autoSeal := NewAutoSeal(testSeal)
autoSeal.SetCore(core)
pBackend := newTestBackend(t)
core.physical = pBackend
core.metricSink = metricsutil.NewClusterMetricSink("", inmemSink)

sealHealthTestIntervalNominal = 10 * time.Millisecond
sealHealthTestIntervalUnhealthy = 10 * time.Millisecond
autoSeal.StartHealthCheck()
*testErr = errors.New("disconnected")

time.Sleep(50 * time.Millisecond)

asu := strings.Join(autoSealUnavailableDuration, ".") + ";cluster="
intervals := inmemSink.Data()
if len(intervals) == 1 {
interval := inmemSink.Data()[0]

if _, ok := interval.Gauges[asu]; !ok {
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
}
if interval.Gauges[asu].Value == 0 {
t.Fatalf("Expected value metric %s to be non-zero", asu)
}
}
*testErr = nil
time.Sleep(50 * time.Millisecond)
intervals = inmemSink.Data()
if len(intervals) == 1 {
interval := inmemSink.Data()[0]

if _, ok := interval.Gauges[asu]; !ok {
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
}
if interval.Gauges[asu].Value != 0 {
t.Fatalf("Expected value metric %s to be non-zero", asu)
}
}
}