Add a periodic test of the autoseal to detect loss of connectivity. (#13078)

* Add a periodic test of the autoseal to detect loss of connectivity

* Keep the logic adjacent to autoseal

* imports

* typo, plus unnecessary constant time compare

* changelog

* pr feedback

* More feedback

* Add locking and a unit test

* unnecessary

* Add timeouts to encrypt/decrypt operations, capture activeContext before starting loop

* Add a block scope for the timeout

* copy/paste ftl

* Refactor to use two timeouts, and cleanup the repetitive failure code

* Readd 0ing gauge

* use millis

* Invert the unit test logic
This commit is contained in:
Scott Miller 2021-11-10 14:46:07 -06:00 committed by GitHub
parent e6ffaaf835
commit 10270b6985
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 175 additions and 1 deletions

3
changelog/13078.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
core: Periodically test the health of connectivity to auto-seal backends
```

View File

@ -2143,6 +2143,10 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc,
if err := seal.UpgradeKeys(c.activeContext); err != nil {
c.logger.Warn("post-unseal upgrade seal keys failed", "error", err)
}
// Start a periodic but infrequent heartbeat to detect auto-seal backend outages at runtime rather than being
// surprised by this at the next need to unseal.
seal.StartHealthCheck()
}
c.metricsCh = make(chan struct{})
@ -2224,6 +2228,10 @@ func (c *Core) preSeal() error {
c.autoRotateCancel = nil
}
if seal, ok := c.seal.(*autoSeal); ok {
seal.StopHealthCheck()
}
preSealPhysical(c)
c.logger.Info("pre-seal teardown complete")

View File

@ -1,11 +1,15 @@
package vault
import (
"bytes"
"context"
"crypto/subtle"
"encoding/json"
"fmt"
mathrand "math/rand"
"sync"
"sync/atomic"
"time"
proto "github.com/golang/protobuf/proto"
log "github.com/hashicorp/go-hclog"
@ -16,7 +20,14 @@ import (
// barrierTypeUpgradeCheck checks for backwards compat on barrier type, not
// applicable in the OSS side
var barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
var (
barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
// autoSealUnavailableDuration is the metric key of the gauge that the seal health check sets to the
// number of milliseconds since the seal backend was last seen healthy, and to 0 once a test passes.
autoSealUnavailableDuration = []string{"seal", "unreachable", "time"}
// The health check timings below are variables rather than constants so unit tests can shorten them.
// sealHealthTestIntervalNominal is the interval between health tests while the backend is healthy.
sealHealthTestIntervalNominal = 10 * time.Minute
// sealHealthTestIntervalUnhealthy is the interval between health tests after a test has failed.
sealHealthTestIntervalUnhealthy = 1 * time.Minute
// sealHealthTestTimeout bounds each encrypt and decrypt operation performed by a health test.
sealHealthTestTimeout = 1 * time.Minute
)
// autoSeal is a Seal implementation that contains logic for encrypting and
// decrypting stored keys via an underlying AutoSealAccess implementation, as
@ -28,6 +39,9 @@ type autoSeal struct {
recoveryConfig atomic.Value
core *Core
logger log.Logger
hcLock sync.Mutex
healthCheckStop chan struct{}
}
// Ensure we are implementing the Seal interface
@ -499,3 +513,82 @@ func (d *autoSeal) migrateRecoveryConfig(ctx context.Context) error {
return nil
}
// StartHealthCheck starts a goroutine that periodically tests the health of the auto-unseal backend by
// encrypting a small heartbeat value and decrypting it again. The test runs every
// sealHealthTestIntervalNominal (10 minutes by default) while the backend is healthy. When a test fails,
// a warning is logged, the autoSealUnavailableDuration gauge is set to the number of milliseconds since
// the backend was last seen healthy (or since the check started), and the test is repeated every
// sealHealthTestIntervalUnhealthy (one minute by default) until it passes again.
//
// Any health check that is already running is stopped first. Call StopHealthCheck to end the goroutine.
func (d *autoSeal) StartHealthCheck() {
	d.StopHealthCheck()
	d.hcLock.Lock()
	defer d.hcLock.Unlock()

	healthCheck := time.NewTicker(sealHealthTestIntervalNominal)
	d.healthCheckStop = make(chan struct{})
	// Keep a local copy of the channel: StopHealthCheck sets the field to nil after closing it.
	healthCheckStop := d.healthCheckStop
	// Capture the active context on the caller's goroutine, before the loop starts, so the
	// background goroutine never reads the core's field concurrently with a later state change.
	ctx := d.core.activeContext

	go func() {
		defer healthCheck.Stop()

		lastTestOk := true
		lastSeenOk := time.Now()

		// fail records one failed test: it logs the reason, switches to the faster unhealthy
		// interval on the first failure, and publishes how long the backend has been unreachable.
		fail := func(msg string, args ...interface{}) {
			d.logger.Warn(msg, args...)
			if lastTestOk {
				healthCheck.Reset(sealHealthTestIntervalUnhealthy)
			}
			lastTestOk = false
			d.core.MetricSink().SetGauge(autoSealUnavailableDuration, float32(time.Since(lastSeenOk).Milliseconds()))
		}

		// runTest performs one encrypt/decrypt round trip. Each operation gets its own
		// sealHealthTestTimeout derived from the captured context, and the first failure ends the
		// test so that a single problem is reported exactly once.
		runTest := func(t time.Time) {
			testVal := fmt.Sprintf("Heartbeat %d", mathrand.Intn(1000))

			encCtx, encCancel := context.WithTimeout(ctx, sealHealthTestTimeout)
			ciphertext, err := d.Access.Encrypt(encCtx, []byte(testVal), nil)
			encCancel()
			if err != nil {
				fail("failed to encrypt seal health test value, seal backend may be unreachable", "error", err)
				return
			}

			decCtx, decCancel := context.WithTimeout(ctx, sealHealthTestTimeout)
			plaintext, err := d.Access.Decrypt(decCtx, ciphertext, nil)
			decCancel()
			if err != nil {
				fail("failed to decrypt seal health test value, seal backend may be unreachable", "error", err)
				return
			}
			if !bytes.Equal([]byte(testVal), plaintext) {
				fail("seal health test value failed to decrypt to expected value")
				return
			}

			d.logger.Debug("seal health test passed")
			if !lastTestOk {
				d.logger.Info("seal backend is now healthy again", "downtime", t.Sub(lastSeenOk).String())
				healthCheck.Reset(sealHealthTestIntervalNominal)
			}
			lastTestOk = true
			lastSeenOk = t
			d.core.MetricSink().SetGauge(autoSealUnavailableDuration, 0)
		}

		for {
			select {
			case <-healthCheckStop:
				return
			case t := <-healthCheck.C:
				runTest(t)
			}
		}
	}()
}
// StopHealthCheck signals the goroutine started by StartHealthCheck to exit by closing its stop
// channel. It does nothing when no health check is running, so it may be called repeatedly.
func (d *autoSeal) StopHealthCheck() {
	d.hcLock.Lock()
	defer d.hcLock.Unlock()

	stopCh := d.healthCheckStop
	if stopCh == nil {
		// No health check has been started, or it was already stopped.
		return
	}
	close(stopCh)
	d.healthCheckStop = nil
}

View File

@ -3,8 +3,13 @@ package vault
import (
"bytes"
"context"
"errors"
"github.com/armon/go-metrics"
"github.com/hashicorp/vault/helper/metricsutil"
"reflect"
"strings"
"testing"
"time"
proto "github.com/golang/protobuf/proto"
wrapping "github.com/hashicorp/go-kms-wrapping"
@ -157,3 +162,68 @@ func TestAutoSeal_UpgradeKeys(t *testing.T) {
}
check()
}
// TestAutoSeal_HealthCheck verifies that the periodic seal health check publishes a non-zero
// autoSealUnavailableDuration gauge while the seal backend returns errors, and resets the gauge to
// zero once the backend works again.
func TestAutoSeal_HealthCheck(t *testing.T) {
	// Use one very long interval so every gauge update lands in the same metrics interval.
	inmemSink := metrics.NewInmemSink(
		1000000*time.Hour,
		2000000*time.Hour)

	metricsConf := metrics.DefaultConfig("")
	metricsConf.EnableHostname = false
	metricsConf.EnableHostnameLabel = false
	metricsConf.EnableServiceLabel = false
	metricsConf.EnableTypePrefix = false

	metrics.NewGlobal(metricsConf, inmemSink)

	core, _, _ := TestCoreUnsealed(t)
	testSeal, testErr := seal.NewToggleableTestSeal(nil)

	// Set initial encryption key.
	testSeal.Wrapper.(*seal.ToggleableWrapper).Wrapper.(*wrapping.TestWrapper).SetKeyID("kaz")

	autoSeal := NewAutoSeal(testSeal)
	autoSeal.SetCore(core)
	pBackend := newTestBackend(t)
	core.physical = pBackend
	core.metricSink = metricsutil.NewClusterMetricSink("", inmemSink)

	// Shorten the health check intervals for the test and restore them afterwards so other tests
	// in the package see the defaults.
	origNominal, origUnhealthy := sealHealthTestIntervalNominal, sealHealthTestIntervalUnhealthy
	defer func() {
		sealHealthTestIntervalNominal, sealHealthTestIntervalUnhealthy = origNominal, origUnhealthy
	}()
	sealHealthTestIntervalNominal = 10 * time.Millisecond
	sealHealthTestIntervalUnhealthy = 10 * time.Millisecond

	autoSeal.StartHealthCheck()
	// Deferred after the restore above, so it runs first: the goroutine is told to stop before the
	// intervals are reset.
	defer autoSeal.StopHealthCheck()

	asu := strings.Join(autoSealUnavailableDuration, ".") + ";cluster="

	// expectGauge asserts that the unavailable-duration gauge exists and is zero or non-zero.
	expectGauge := func(wantZero bool) {
		t.Helper()
		intervals := inmemSink.Data()
		if len(intervals) != 1 {
			t.Fatalf("Expected exactly one metrics interval, got %d", len(intervals))
		}
		gauge, ok := intervals[0].Gauges[asu]
		if !ok {
			t.Fatalf("Expected metrics to include a value for gauge %s", asu)
		}
		switch {
		case wantZero && gauge.Value != 0:
			t.Fatalf("Expected value metric %s to be zero", asu)
		case !wantZero && gauge.Value == 0:
			t.Fatalf("Expected value metric %s to be non-zero", asu)
		}
	}

	// Simulate loss of connectivity to the seal backend and give the health check time to notice.
	*testErr = errors.New("disconnected")
	time.Sleep(50 * time.Millisecond)
	expectGauge(false)

	// Restore connectivity; the next passing test must zero the gauge.
	*testErr = nil
	time.Sleep(50 * time.Millisecond)
	expectGauge(true)
}