Add a periodic test of the autoseal to detect loss of connectivity. (#13078)
* Add a periodic test of the autoseal to detect loss of connectivity * Keep the logic adjacent to autoseal * imports * typo, plus unnecessary constant time compare * changelog * pr feedback * More feedback * Add locking and a unit test * unnecessary * Add timeouts to encrypt/decrypt operations, capture activeContext before starting loop * Add a block scope for the timeout * copy/paste ftl * Refactor to use two timeouts, and cleanup the repetitive failure code * Readd 0ing gauge * use millis * Invert the unit test logic
This commit is contained in:
parent
e6ffaaf835
commit
10270b6985
|
@ -0,0 +1,3 @@
|
|||
```release-note:improvement
|
||||
core: Periodically test the health of connectivity to auto-seal backends
|
||||
```
|
|
@ -2143,6 +2143,10 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc,
|
|||
if err := seal.UpgradeKeys(c.activeContext); err != nil {
|
||||
c.logger.Warn("post-unseal upgrade seal keys failed", "error", err)
|
||||
}
|
||||
|
||||
// Start a periodic but infrequent heartbeat to detect auto-seal backend outages at runtime rather than being
|
||||
// surprised by this at the next need to unseal.
|
||||
seal.StartHealthCheck()
|
||||
}
|
||||
|
||||
c.metricsCh = make(chan struct{})
|
||||
|
@ -2224,6 +2228,10 @@ func (c *Core) preSeal() error {
|
|||
c.autoRotateCancel = nil
|
||||
}
|
||||
|
||||
if seal, ok := c.seal.(*autoSeal); ok {
|
||||
seal.StopHealthCheck()
|
||||
}
|
||||
|
||||
preSealPhysical(c)
|
||||
|
||||
c.logger.Info("pre-seal teardown complete")
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
package vault
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/subtle"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
mathrand "math/rand"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
proto "github.com/golang/protobuf/proto"
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
|
@ -16,7 +20,14 @@ import (
|
|||
|
||||
// barrierTypeUpgradeCheck checks for backwards compat on barrier type, not
|
||||
// applicable in the OSS side
|
||||
var barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
|
||||
var (
|
||||
barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
|
||||
autoSealUnavailableDuration = []string{"seal", "unreachable", "time"}
|
||||
// vars for unit testings
|
||||
sealHealthTestIntervalNominal = 10 * time.Minute
|
||||
sealHealthTestIntervalUnhealthy = 1 * time.Minute
|
||||
sealHealthTestTimeout = 1 * time.Minute
|
||||
)
|
||||
|
||||
// autoSeal is a Seal implementation that contains logic for encrypting and
|
||||
// decrypting stored keys via an underlying AutoSealAccess implementation, as
|
||||
|
@ -28,6 +39,9 @@ type autoSeal struct {
|
|||
recoveryConfig atomic.Value
|
||||
core *Core
|
||||
logger log.Logger
|
||||
|
||||
hcLock sync.Mutex
|
||||
healthCheckStop chan struct{}
|
||||
}
|
||||
|
||||
// Ensure we are implementing the Seal interface
|
||||
|
@ -499,3 +513,82 @@ func (d *autoSeal) migrateRecoveryConfig(ctx context.Context) error {
|
|||
|
||||
return nil
|
||||
}
|
||||
|
||||
// StartHealthCheck starts a goroutine that tests the health of the auto-unseal backend once every 10 minutes.
|
||||
// If unhealthy, logs a warning on the condition and begins testing every one minute until healthy again.
|
||||
func (d *autoSeal) StartHealthCheck() {
|
||||
d.StopHealthCheck()
|
||||
d.hcLock.Lock()
|
||||
defer d.hcLock.Unlock()
|
||||
|
||||
healthCheck := time.NewTicker(sealHealthTestIntervalNominal)
|
||||
d.healthCheckStop = make(chan struct{})
|
||||
healthCheckStop := d.healthCheckStop
|
||||
|
||||
go func() {
|
||||
ctx := d.core.activeContext
|
||||
lastTestOk := true
|
||||
lastSeenOk := time.Now()
|
||||
|
||||
fail := func(msg string, args ...interface{}) {
|
||||
d.logger.Warn(msg, args...)
|
||||
if lastTestOk {
|
||||
healthCheck.Reset(sealHealthTestIntervalUnhealthy)
|
||||
}
|
||||
lastTestOk = false
|
||||
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, float32(time.Since(lastSeenOk).Milliseconds()))
|
||||
}
|
||||
for {
|
||||
select {
|
||||
case <-healthCheckStop:
|
||||
if healthCheck != nil {
|
||||
healthCheck.Stop()
|
||||
}
|
||||
healthCheckStop = nil
|
||||
return
|
||||
case t := <-healthCheck.C:
|
||||
func() {
|
||||
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
|
||||
defer cancel()
|
||||
|
||||
testVal := fmt.Sprintf("Heartbeat %d", mathrand.Intn(1000))
|
||||
ciphertext, err := d.Access.Encrypt(ctx, []byte(testVal), nil)
|
||||
|
||||
if err != nil {
|
||||
fail("failed to encrypt seal health test value, seal backend may be unreachable", "error", err)
|
||||
} else {
|
||||
func() {
|
||||
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
|
||||
defer cancel()
|
||||
plaintext, err := d.Access.Decrypt(ctx, ciphertext, nil)
|
||||
if err != nil {
|
||||
fail("failed to decrypt seal health test value, seal backend may be unreachable", "error", err)
|
||||
}
|
||||
if !bytes.Equal([]byte(testVal), plaintext) {
|
||||
fail("seal health test value failed to decrypt to expected value")
|
||||
} else {
|
||||
d.logger.Debug("seal health test passed")
|
||||
if !lastTestOk {
|
||||
d.logger.Info("seal backend is now healthy again", "downtime", t.Sub(lastSeenOk).String())
|
||||
healthCheck.Reset(sealHealthTestIntervalNominal)
|
||||
}
|
||||
lastTestOk = true
|
||||
lastSeenOk = t
|
||||
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, 0)
|
||||
}
|
||||
}()
|
||||
}
|
||||
}()
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
func (d *autoSeal) StopHealthCheck() {
|
||||
d.hcLock.Lock()
|
||||
defer d.hcLock.Unlock()
|
||||
if d.healthCheckStop != nil {
|
||||
close(d.healthCheckStop)
|
||||
d.healthCheckStop = nil
|
||||
}
|
||||
}
|
||||
|
|
|
@ -3,8 +3,13 @@ package vault
|
|||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"errors"
|
||||
"github.com/armon/go-metrics"
|
||||
"github.com/hashicorp/vault/helper/metricsutil"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
proto "github.com/golang/protobuf/proto"
|
||||
wrapping "github.com/hashicorp/go-kms-wrapping"
|
||||
|
@ -157,3 +162,68 @@ func TestAutoSeal_UpgradeKeys(t *testing.T) {
|
|||
}
|
||||
check()
|
||||
}
|
||||
|
||||
func TestAutoSeal_HealthCheck(t *testing.T) {
|
||||
inmemSink := metrics.NewInmemSink(
|
||||
1000000*time.Hour,
|
||||
2000000*time.Hour)
|
||||
|
||||
metricsConf := metrics.DefaultConfig("")
|
||||
metricsConf.EnableHostname = false
|
||||
metricsConf.EnableHostnameLabel = false
|
||||
metricsConf.EnableServiceLabel = false
|
||||
metricsConf.EnableTypePrefix = false
|
||||
|
||||
metrics.NewGlobal(metricsConf, inmemSink)
|
||||
|
||||
core, _, _ := TestCoreUnsealed(t)
|
||||
testSeal, testErr := seal.NewToggleableTestSeal(nil)
|
||||
|
||||
var encKeys []string
|
||||
changeKey := func(key string) {
|
||||
encKeys = append(encKeys, key)
|
||||
testSeal.Wrapper.(*seal.ToggleableWrapper).Wrapper.(*wrapping.TestWrapper).SetKeyID(key)
|
||||
}
|
||||
|
||||
// Set initial encryption key.
|
||||
changeKey("kaz")
|
||||
|
||||
autoSeal := NewAutoSeal(testSeal)
|
||||
autoSeal.SetCore(core)
|
||||
pBackend := newTestBackend(t)
|
||||
core.physical = pBackend
|
||||
core.metricSink = metricsutil.NewClusterMetricSink("", inmemSink)
|
||||
|
||||
sealHealthTestIntervalNominal = 10 * time.Millisecond
|
||||
sealHealthTestIntervalUnhealthy = 10 * time.Millisecond
|
||||
autoSeal.StartHealthCheck()
|
||||
*testErr = errors.New("disconnected")
|
||||
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
asu := strings.Join(autoSealUnavailableDuration, ".") + ";cluster="
|
||||
intervals := inmemSink.Data()
|
||||
if len(intervals) == 1 {
|
||||
interval := inmemSink.Data()[0]
|
||||
|
||||
if _, ok := interval.Gauges[asu]; !ok {
|
||||
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
|
||||
}
|
||||
if interval.Gauges[asu].Value == 0 {
|
||||
t.Fatalf("Expected value metric %s to be non-zero", asu)
|
||||
}
|
||||
}
|
||||
*testErr = nil
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
intervals = inmemSink.Data()
|
||||
if len(intervals) == 1 {
|
||||
interval := inmemSink.Data()[0]
|
||||
|
||||
if _, ok := interval.Gauges[asu]; !ok {
|
||||
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
|
||||
}
|
||||
if interval.Gauges[asu].Value != 0 {
|
||||
t.Fatalf("Expected value metric %s to be non-zero", asu)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue