diff --git a/changelog/13078.txt b/changelog/13078.txt new file mode 100644 index 000000000..789445b63 --- /dev/null +++ b/changelog/13078.txt @@ -0,0 +1,3 @@ +```release-note:improvement +core: Periodically test the health of connectivity to auto-seal backends +``` \ No newline at end of file diff --git a/vault/core.go b/vault/core.go index 77408ef08..8ae95f6c7 100644 --- a/vault/core.go +++ b/vault/core.go @@ -2143,6 +2143,10 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc, if err := seal.UpgradeKeys(c.activeContext); err != nil { c.logger.Warn("post-unseal upgrade seal keys failed", "error", err) } + + // Start a periodic but infrequent heartbeat to detect auto-seal backend outages at runtime rather than being + // surprised by this at the next need to unseal. + seal.StartHealthCheck() } c.metricsCh = make(chan struct{}) @@ -2224,6 +2228,10 @@ func (c *Core) preSeal() error { c.autoRotateCancel = nil } + if seal, ok := c.seal.(*autoSeal); ok { + seal.StopHealthCheck() + } + preSealPhysical(c) c.logger.Info("pre-seal teardown complete") diff --git a/vault/seal_autoseal.go b/vault/seal_autoseal.go index a037bc866..8d064a8f9 100644 --- a/vault/seal_autoseal.go +++ b/vault/seal_autoseal.go @@ -1,11 +1,15 @@ package vault import ( + "bytes" "context" "crypto/subtle" "encoding/json" "fmt" + mathrand "math/rand" + "sync" "sync/atomic" + "time" proto "github.com/golang/protobuf/proto" log "github.com/hashicorp/go-hclog" @@ -16,7 +20,14 @@ import ( // barrierTypeUpgradeCheck checks for backwards compat on barrier type, not // applicable in the OSS side -var barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {} +var ( + barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {} + autoSealUnavailableDuration = []string{"seal", "unreachable", "time"} + // Seal health check timings; declared as vars so unit tests can shorten them. + sealHealthTestIntervalNominal = 10 * time.Minute + sealHealthTestIntervalUnhealthy = 1 * time.Minute + sealHealthTestTimeout = 1 * time.Minute +) // autoSeal 
is a Seal implementation that contains logic for encrypting and
 // decrypting stored keys via an underlying AutoSealAccess implementation, as
@@ -28,6 +39,9 @@ type autoSeal struct {
 	recoveryConfig atomic.Value
 	core           *Core
 	logger         log.Logger
+
+	hcLock          sync.Mutex    // guards healthCheckStop
+	healthCheckStop chan struct{} // closed to stop the health check goroutine; nil when none was started
 }
 
 // Ensure we are implementing the Seal interface
@@ -499,3 +513,112 @@ func (d *autoSeal) migrateRecoveryConfig(ctx context.Context) error {
 
 	return nil
 }
+
+// StartHealthCheck starts a goroutine that periodically tests the health of the
+// auto-seal backend by encrypting and then decrypting a small test value. Tests
+// run every sealHealthTestIntervalNominal while the backend is healthy. On the
+// first failure a warning is logged and testing switches to every
+// sealHealthTestIntervalUnhealthy until a test passes again. Any health check
+// that is already running is stopped first.
+func (d *autoSeal) StartHealthCheck() {
+	d.hcLock.Lock()
+	defer d.hcLock.Unlock()
+
+	// Stop and restart under a single lock acquisition so that two concurrent
+	// callers cannot each install a stop channel and orphan a goroutine.
+	d.stopHealthCheckLocked()
+
+	stopCh := make(chan struct{})
+	d.healthCheckStop = stopCh
+
+	// The context argument is evaluated on the caller's goroutine; the
+	// background goroutine never touches d.core.activeContext itself.
+	go d.runHealthCheck(d.core.activeContext, stopCh)
+}
+
+// StopHealthCheck stops the health check goroutine started by StartHealthCheck.
+// It is a no-op if no health check is running.
+func (d *autoSeal) StopHealthCheck() {
+	d.hcLock.Lock()
+	defer d.hcLock.Unlock()
+	d.stopHealthCheckLocked()
+}
+
+// stopHealthCheckLocked signals the running health check goroutine, if any, to
+// exit. The caller must hold d.hcLock.
+func (d *autoSeal) stopHealthCheckLocked() {
+	if d.healthCheckStop != nil {
+		close(d.healthCheckStop)
+		d.healthCheckStop = nil
+	}
+}
+
+// runHealthCheck is the body of the health check goroutine. It returns when
+// stopCh is closed or ctx is done.
+func (d *autoSeal) runHealthCheck(ctx context.Context, stopCh <-chan struct{}) {
+	ticker := time.NewTicker(sealHealthTestIntervalNominal)
+	defer ticker.Stop()
+
+	lastTestOk := true
+	lastSeenOk := time.Now()
+
+	// fail records one failed test: it logs a warning, switches to the faster
+	// test interval on the first failure, and publishes how long the backend
+	// has been unreachable, in milliseconds.
+	fail := func(msg string, args ...interface{}) {
+		d.logger.Warn(msg, args...)
+		if lastTestOk {
+			ticker.Reset(sealHealthTestIntervalUnhealthy)
+		}
+		lastTestOk = false
+		d.core.MetricSink().SetGauge(autoSealUnavailableDuration, float32(time.Since(lastSeenOk).Milliseconds()))
+	}
+
+	// check round-trips a test value through the seal backend and records the
+	// outcome. Each failed test is reported exactly once.
+	check := func(t time.Time) {
+		testVal := fmt.Sprintf("Heartbeat %d", mathrand.Intn(1000))
+
+		encCtx, encCancel := context.WithTimeout(ctx, sealHealthTestTimeout)
+		ciphertext, err := d.Access.Encrypt(encCtx, []byte(testVal), nil)
+		encCancel()
+		if err != nil {
+			fail("failed to encrypt seal health test value, seal backend may be unreachable", "error", err)
+			return
+		}
+
+		// Derive the decrypt deadline from ctx, not encCtx, so that decryption
+		// gets a full timeout of its own rather than what encryption left over.
+		decCtx, decCancel := context.WithTimeout(ctx, sealHealthTestTimeout)
+		plaintext, err := d.Access.Decrypt(decCtx, ciphertext, nil)
+		decCancel()
+		if err != nil {
+			fail("failed to decrypt seal health test value, seal backend may be unreachable", "error", err)
+			return
+		}
+		if !bytes.Equal([]byte(testVal), plaintext) {
+			fail("seal health test value failed to decrypt to expected value")
+			return
+		}
+
+		d.logger.Debug("seal health test passed")
+		if !lastTestOk {
+			d.logger.Info("seal backend is now healthy again", "downtime", t.Sub(lastSeenOk).String())
+			ticker.Reset(sealHealthTestIntervalNominal)
+		}
+		lastTestOk = true
+		lastSeenOk = t
+		d.core.MetricSink().SetGauge(autoSealUnavailableDuration, 0)
+	}
+
+	for {
+		select {
+		case <-stopCh:
+			return
+		case <-ctx.Done():
+			return
+		case t := <-ticker.C:
+			check(t)
+		}
+	}
+}
diff --git a/vault/seal_autoseal_test.go b/vault/seal_autoseal_test.go
index 9c9827055..24e49ea74 100644
--- a/vault/seal_autoseal_test.go
+++ b/vault/seal_autoseal_test.go
@@ -3,8 +3,13 @@ package vault
 import (
 	"bytes"
 	"context"
+	"errors"
+	"github.com/armon/go-metrics"
+	"github.com/hashicorp/vault/helper/metricsutil"
 	"reflect"
+	"strings"
 	"testing"
+	"time"
 
 	proto "github.com/golang/protobuf/proto"
 	wrapping "github.com/hashicorp/go-kms-wrapping"
@@ -157,3 +162,68 @@ func TestAutoSeal_UpgradeKeys(t *testing.T) {
 	}
 	check()
 }
+
+func TestAutoSeal_HealthCheck(t *testing.T) {
+	// The sink interval (1000000h) is far longer than the test; the assertions
+	// below rely on all samples sharing one in-memory metrics interval.
+	inmemSink := metrics.NewInmemSink(
+		1000000*time.Hour,
+		2000000*time.Hour)
+
+	metricsConf := metrics.DefaultConfig("")
+	metricsConf.EnableHostname = false
+	metricsConf.EnableHostnameLabel = false
+	metricsConf.EnableServiceLabel = false
+	metricsConf.EnableTypePrefix = false
+
+	metrics.NewGlobal(metricsConf, inmemSink)
+
+	core, _, _ := TestCoreUnsealed(t)
+	testSeal, testErr := seal.NewToggleableTestSeal(nil)
+
+	// Set initial encryption key.
+	testSeal.Wrapper.(*seal.ToggleableWrapper).Wrapper.(*wrapping.TestWrapper).SetKeyID("kaz")
+
+	autoSeal := NewAutoSeal(testSeal)
+	autoSeal.SetCore(core)
+	core.physical = newTestBackend(t)
+	core.metricSink = metricsutil.NewClusterMetricSink("", inmemSink)
+
+	asu := strings.Join(autoSealUnavailableDuration, ".") + ";cluster="
+
+	// unavailableGauge returns the current value of the seal-unreachable gauge
+	// and fails the test if the gauge has not been recorded.
+	unavailableGauge := func() float32 {
+		t.Helper()
+		intervals := inmemSink.Data()
+		if len(intervals) != 1 {
+			t.Fatalf("Expected exactly one metrics interval, got %d", len(intervals))
+		}
+		gauge, ok := intervals[0].Gauges[asu]
+		if !ok {
+			t.Fatalf("Expected metrics to include a value for gauge %s", asu)
+		}
+		return gauge.Value
+	}
+
+	// Shorten the test intervals so several probes run within each sleep below.
+	sealHealthTestIntervalNominal = 10 * time.Millisecond
+	sealHealthTestIntervalUnhealthy = 10 * time.Millisecond
+	autoSeal.StartHealthCheck()
+	// Do not leave the health check goroutine running after the test returns.
+	defer autoSeal.StopHealthCheck()
+
+	// Simulate an outage: the gauge must report a non-zero unreachable time.
+	*testErr = errors.New("disconnected")
+	time.Sleep(50 * time.Millisecond)
+	if unavailableGauge() == 0 {
+		t.Fatalf("Expected value metric %s to be non-zero", asu)
+	}
+
+	// Restore the backend: the gauge must drop back to zero.
+	*testErr = nil
+	time.Sleep(50 * time.Millisecond)
+	if unavailableGauge() != 0 {
+		t.Fatalf("Expected value metric %s to be zero", asu)
+	}
+}