Add a periodic test of the autoseal to detect loss of connectivity. (#13078)
* Add a periodic test of the autoseal to detect loss of connectivity * Keep the logic adjacent to autoseal * imports * typo, plus unnecessary constant time compare * changelog * pr feedback * More feedback * Add locking and a unit test * unnecessary * Add timeouts to encrypt/decrypt operations, capture activeContext before starting loop * Add a block scope for the timeout * copy/paste ftl * Refactor to use two timeouts, and cleanup the repetitive failure code * Readd 0ing gauge * use millis * Invert the unit test logic
This commit is contained in:
parent
e6ffaaf835
commit
10270b6985
|
@ -0,0 +1,3 @@
|
||||||
|
```release-note:improvement
|
||||||
|
core: Periodically test the health of connectivity to auto-seal backends
|
||||||
|
```
|
|
@ -2143,6 +2143,10 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc,
|
||||||
if err := seal.UpgradeKeys(c.activeContext); err != nil {
|
if err := seal.UpgradeKeys(c.activeContext); err != nil {
|
||||||
c.logger.Warn("post-unseal upgrade seal keys failed", "error", err)
|
c.logger.Warn("post-unseal upgrade seal keys failed", "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Start a periodic but infrequent heartbeat to detect auto-seal backend outages at runtime rather than being
|
||||||
|
// surprised by this at the next need to unseal.
|
||||||
|
seal.StartHealthCheck()
|
||||||
}
|
}
|
||||||
|
|
||||||
c.metricsCh = make(chan struct{})
|
c.metricsCh = make(chan struct{})
|
||||||
|
@ -2224,6 +2228,10 @@ func (c *Core) preSeal() error {
|
||||||
c.autoRotateCancel = nil
|
c.autoRotateCancel = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if seal, ok := c.seal.(*autoSeal); ok {
|
||||||
|
seal.StopHealthCheck()
|
||||||
|
}
|
||||||
|
|
||||||
preSealPhysical(c)
|
preSealPhysical(c)
|
||||||
|
|
||||||
c.logger.Info("pre-seal teardown complete")
|
c.logger.Info("pre-seal teardown complete")
|
||||||
|
|
|
@ -1,11 +1,15 @@
|
||||||
package vault
|
package vault
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"crypto/subtle"
|
"crypto/subtle"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
mathrand "math/rand"
|
||||||
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
"time"
|
||||||
|
|
||||||
proto "github.com/golang/protobuf/proto"
|
proto "github.com/golang/protobuf/proto"
|
||||||
log "github.com/hashicorp/go-hclog"
|
log "github.com/hashicorp/go-hclog"
|
||||||
|
@ -16,7 +20,14 @@ import (
|
||||||
|
|
||||||
// barrierTypeUpgradeCheck checks for backwards compat on barrier type, not
|
// barrierTypeUpgradeCheck checks for backwards compat on barrier type, not
|
||||||
// applicable in the OSS side
|
// applicable in the OSS side
|
||||||
var barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
|
var (
|
||||||
|
barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
|
||||||
|
autoSealUnavailableDuration = []string{"seal", "unreachable", "time"}
|
||||||
|
// Declared as vars rather than consts so unit tests can shorten them.
|
||||||
|
sealHealthTestIntervalNominal = 10 * time.Minute
|
||||||
|
sealHealthTestIntervalUnhealthy = 1 * time.Minute
|
||||||
|
sealHealthTestTimeout = 1 * time.Minute
|
||||||
|
)
|
||||||
|
|
||||||
// autoSeal is a Seal implementation that contains logic for encrypting and
|
// autoSeal is a Seal implementation that contains logic for encrypting and
|
||||||
// decrypting stored keys via an underlying AutoSealAccess implementation, as
|
// decrypting stored keys via an underlying AutoSealAccess implementation, as
|
||||||
|
@ -28,6 +39,9 @@ type autoSeal struct {
|
||||||
recoveryConfig atomic.Value
|
recoveryConfig atomic.Value
|
||||||
core *Core
|
core *Core
|
||||||
logger log.Logger
|
logger log.Logger
|
||||||
|
|
||||||
|
hcLock sync.Mutex
|
||||||
|
healthCheckStop chan struct{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure we are implementing the Seal interface
|
// Ensure we are implementing the Seal interface
|
||||||
|
@ -499,3 +513,82 @@ func (d *autoSeal) migrateRecoveryConfig(ctx context.Context) error {
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// StartHealthCheck starts a goroutine that tests the health of the auto-unseal backend once every 10 minutes.
|
||||||
|
// If unhealthy, logs a warning on the condition and begins testing every one minute until healthy again.
|
||||||
|
func (d *autoSeal) StartHealthCheck() {
|
||||||
|
d.StopHealthCheck()
|
||||||
|
d.hcLock.Lock()
|
||||||
|
defer d.hcLock.Unlock()
|
||||||
|
|
||||||
|
healthCheck := time.NewTicker(sealHealthTestIntervalNominal)
|
||||||
|
d.healthCheckStop = make(chan struct{})
|
||||||
|
healthCheckStop := d.healthCheckStop
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
ctx := d.core.activeContext
|
||||||
|
lastTestOk := true
|
||||||
|
lastSeenOk := time.Now()
|
||||||
|
|
||||||
|
fail := func(msg string, args ...interface{}) {
|
||||||
|
d.logger.Warn(msg, args...)
|
||||||
|
if lastTestOk {
|
||||||
|
healthCheck.Reset(sealHealthTestIntervalUnhealthy)
|
||||||
|
}
|
||||||
|
lastTestOk = false
|
||||||
|
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, float32(time.Since(lastSeenOk).Milliseconds()))
|
||||||
|
}
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-healthCheckStop:
|
||||||
|
if healthCheck != nil {
|
||||||
|
healthCheck.Stop()
|
||||||
|
}
|
||||||
|
healthCheckStop = nil
|
||||||
|
return
|
||||||
|
case t := <-healthCheck.C:
|
||||||
|
func() {
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
testVal := fmt.Sprintf("Heartbeat %d", mathrand.Intn(1000))
|
||||||
|
ciphertext, err := d.Access.Encrypt(ctx, []byte(testVal), nil)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
fail("failed to encrypt seal health test value, seal backend may be unreachable", "error", err)
|
||||||
|
} else {
|
||||||
|
func() {
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
|
||||||
|
defer cancel()
|
||||||
|
plaintext, err := d.Access.Decrypt(ctx, ciphertext, nil)
|
||||||
|
if err != nil {
|
||||||
|
fail("failed to decrypt seal health test value, seal backend may be unreachable", "error", err)
|
||||||
|
}
|
||||||
|
if !bytes.Equal([]byte(testVal), plaintext) {
|
||||||
|
fail("seal health test value failed to decrypt to expected value")
|
||||||
|
} else {
|
||||||
|
d.logger.Debug("seal health test passed")
|
||||||
|
if !lastTestOk {
|
||||||
|
d.logger.Info("seal backend is now healthy again", "downtime", t.Sub(lastSeenOk).String())
|
||||||
|
healthCheck.Reset(sealHealthTestIntervalNominal)
|
||||||
|
}
|
||||||
|
lastTestOk = true
|
||||||
|
lastSeenOk = t
|
||||||
|
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, 0)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *autoSeal) StopHealthCheck() {
|
||||||
|
d.hcLock.Lock()
|
||||||
|
defer d.hcLock.Unlock()
|
||||||
|
if d.healthCheckStop != nil {
|
||||||
|
close(d.healthCheckStop)
|
||||||
|
d.healthCheckStop = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -3,8 +3,13 @@ package vault
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
|
"errors"
|
||||||
|
"github.com/armon/go-metrics"
|
||||||
|
"github.com/hashicorp/vault/helper/metricsutil"
|
||||||
"reflect"
|
"reflect"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
proto "github.com/golang/protobuf/proto"
|
proto "github.com/golang/protobuf/proto"
|
||||||
wrapping "github.com/hashicorp/go-kms-wrapping"
|
wrapping "github.com/hashicorp/go-kms-wrapping"
|
||||||
|
@ -157,3 +162,68 @@ func TestAutoSeal_UpgradeKeys(t *testing.T) {
|
||||||
}
|
}
|
||||||
check()
|
check()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestAutoSeal_HealthCheck(t *testing.T) {
|
||||||
|
inmemSink := metrics.NewInmemSink(
|
||||||
|
1000000*time.Hour,
|
||||||
|
2000000*time.Hour)
|
||||||
|
|
||||||
|
metricsConf := metrics.DefaultConfig("")
|
||||||
|
metricsConf.EnableHostname = false
|
||||||
|
metricsConf.EnableHostnameLabel = false
|
||||||
|
metricsConf.EnableServiceLabel = false
|
||||||
|
metricsConf.EnableTypePrefix = false
|
||||||
|
|
||||||
|
metrics.NewGlobal(metricsConf, inmemSink)
|
||||||
|
|
||||||
|
core, _, _ := TestCoreUnsealed(t)
|
||||||
|
testSeal, testErr := seal.NewToggleableTestSeal(nil)
|
||||||
|
|
||||||
|
var encKeys []string
|
||||||
|
changeKey := func(key string) {
|
||||||
|
encKeys = append(encKeys, key)
|
||||||
|
testSeal.Wrapper.(*seal.ToggleableWrapper).Wrapper.(*wrapping.TestWrapper).SetKeyID(key)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set initial encryption key.
|
||||||
|
changeKey("kaz")
|
||||||
|
|
||||||
|
autoSeal := NewAutoSeal(testSeal)
|
||||||
|
autoSeal.SetCore(core)
|
||||||
|
pBackend := newTestBackend(t)
|
||||||
|
core.physical = pBackend
|
||||||
|
core.metricSink = metricsutil.NewClusterMetricSink("", inmemSink)
|
||||||
|
|
||||||
|
sealHealthTestIntervalNominal = 10 * time.Millisecond
|
||||||
|
sealHealthTestIntervalUnhealthy = 10 * time.Millisecond
|
||||||
|
autoSeal.StartHealthCheck()
|
||||||
|
*testErr = errors.New("disconnected")
|
||||||
|
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
|
||||||
|
asu := strings.Join(autoSealUnavailableDuration, ".") + ";cluster="
|
||||||
|
intervals := inmemSink.Data()
|
||||||
|
if len(intervals) == 1 {
|
||||||
|
interval := inmemSink.Data()[0]
|
||||||
|
|
||||||
|
if _, ok := interval.Gauges[asu]; !ok {
|
||||||
|
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
|
||||||
|
}
|
||||||
|
if interval.Gauges[asu].Value == 0 {
|
||||||
|
t.Fatalf("Expected value metric %s to be non-zero", asu)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*testErr = nil
|
||||||
|
time.Sleep(50 * time.Millisecond)
|
||||||
|
intervals = inmemSink.Data()
|
||||||
|
if len(intervals) == 1 {
|
||||||
|
interval := inmemSink.Data()[0]
|
||||||
|
|
||||||
|
if _, ok := interval.Gauges[asu]; !ok {
|
||||||
|
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
|
||||||
|
}
|
||||||
|
if interval.Gauges[asu].Value != 0 {
|
||||||
|
t.Fatalf("Expected value metric %s to be non-zero", asu)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue