Add auto-tidy, last-tidy, and leaf cert health checks (#17901)

* Add enable_auto_tidy health check Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add tidy_last_run health check Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add too_many_certs health check Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add tidy, CRL, cert count checks to CLI Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Cache stored leaf cert count Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Correctly parse last run Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
2022-11-18 11:04:58 -05:00 · 2022-11-18 11:04:58 -05:00 · 8461f096e2
parent 9543067ffe
commit 8461f096e2
5 changed files with 414 additions and 0 deletions
--- a/command/healthcheck/pki.go
+++ b/command/healthcheck/pki.go
@ -198,6 +198,7 @@ func pkiFetchLeaves(e *Executor, versionError func()) (bool, *PathFetch, []strin
 			leaves = append(leaves, rawSerial.(string))
 		}
 		leavesRet.ParsedCache["leaves"] = leaves
+		leavesRet.ParsedCache["count"] = len(leaves)
 	}

 	return false, leavesRet, leavesRet.ParsedCache["leaves"].([]string), nil
--- a/command/healthcheck/pki_enable_auto_tidy.go
+++ b/command/healthcheck/pki_enable_auto_tidy.go
@ -0,0 +1,185 @@
+package healthcheck
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/hashicorp/vault/sdk/logical"
+
+	"github.com/hashicorp/go-secure-stdlib/parseutil"
+)
+
+// EnableAutoTidy is a health check that reads a PKI mount's auto-tidy
+// configuration and reports when auto-tidy is disabled or configured with
+// durations at or above the configured thresholds.
+type EnableAutoTidy struct {
+	// Enabled is populated by LoadConfig; UnsupportedVersion is set by
+	// FetchResources when the server reports the config path as unsupported.
+	Enabled            bool
+	UnsupportedVersion bool
+
+	// Thresholds (populated by LoadConfig) at or above which the mount's
+	// interval_duration and pause_duration are reported as critical/warning.
+	IntervalDurationCritical time.Duration
+	IntervalDurationWarning  time.Duration
+	PauseDurationCritical    time.Duration
+	PauseDurationWarning     time.Duration
+
+	// TidyConfig holds the fetched /{{mount}}/config/auto-tidy response.
+	TidyConfig *PathFetch
+}
+
+// NewEnableAutoTidyCheck returns a new, not yet configured EnableAutoTidy
+// check.
+func NewEnableAutoTidyCheck() Check {
+	return &EnableAutoTidy{}
+}
+
+// Name returns the identifier of this health check.
+func (h *EnableAutoTidy) Name() string {
+	return "enable_auto_tidy"
+}
+
+// IsEnabled reports the value of the Enabled field, which LoadConfig sets.
+func (h *EnableAutoTidy) IsEnabled() bool {
+	return h.Enabled
+}
+
+// DefaultConfig returns the default duration thresholds for this check, keyed
+// by the parameter names that LoadConfig reads.
+func (h *EnableAutoTidy) DefaultConfig() map[string]interface{} {
+	return map[string]interface{}{
+		"interval_duration_critical": "7d",
+		"interval_duration_warning":  "2d",
+		"pause_duration_critical":    "1s",
+		"pause_duration_warning":     "200ms",
+	}
+}
+
+// fromConfig parses config[param] as a duration. On failure it returns a zero
+// duration and an error naming the check, the parameter and the raw value.
+func (h *EnableAutoTidy) fromConfig(config map[string]interface{}, param string) (time.Duration, error) {
+	raw := config[param]
+
+	parsed, err := parseutil.ParseDurationSecond(raw)
+	if err == nil {
+		return parsed, nil
+	}
+
+	return 0, fmt.Errorf("failed to parse parameter %v.%v=%v: %w", h.Name(), param, raw, err)
+}
+
+func (h *EnableAutoTidy) LoadConfig(config map[string]interface{}) error {
+	var err error
+
+	h.IntervalDurationCritical, err = h.fromConfig(config, "interval_duration_critical")
+	if err != nil {
+		return err
+	}
+
+	h.IntervalDurationWarning, err = h.fromConfig(config, "interval_duration_warning")
+	if err != nil {
+		return err
+	}
+
+	h.PauseDurationCritical, err = h.fromConfig(config, "pause_duration_critical")
+	if err != nil {
+		return err
+	}
+
+	h.PauseDurationWarning, err = h.fromConfig(config, "pause_duration_warning")
+	if err != nil {
+		return err
+	}
+
+	enabled, err := parseutil.ParseBool(config["enabled"])
+	if err != nil {
+		return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
+	}
+	h.Enabled = enabled
+
+	return nil
+}
+
+func (h *EnableAutoTidy) FetchResources(e *Executor) error {
+	var err error
+	h.TidyConfig, err = e.FetchIfNotFetched(logical.ReadOperation, "/{{mount}}/config/auto-tidy")
+	if err != nil {
+		return err
+	}
+
+	if h.TidyConfig.IsUnsupportedPathError() {
+		h.UnsupportedVersion = true
+	}
+
+	return nil
+}
+
+// Evaluate turns the fetched auto-tidy configuration into health check
+// results.
+//
+// It returns a single ResultInvalidVersion or ResultInsufficientPermissions
+// result when the configuration could not be read, an informational result
+// when auto-tidy is disabled, and otherwise a warning or critical result for
+// each of interval_duration and pause_duration that meets or exceeds its
+// configured threshold. No results are returned when nothing was fetched or
+// when every value is below its warning threshold. An error is returned when
+// the server's response is empty or cannot be parsed.
+func (h *EnableAutoTidy) Evaluate(e *Executor) (results []*Result, err error) {
+	if h.UnsupportedVersion {
+		ret := Result{
+			Status:   ResultInvalidVersion,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  "This health check requires Vault 1.12+, but an earlier version of Vault Server was contacted, preventing this health check from running.",
+		}
+		return []*Result{&ret}, nil
+	}
+
+	if h.TidyConfig == nil {
+		return nil, nil
+	}
+
+	if h.TidyConfig.IsSecretPermissionsError() {
+		ret := Result{
+			Status:   ResultInsufficientPermissions,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  "This prevents the health check from functioning at all, as it cannot inspect the mount's auto-tidy configuration.",
+		}
+
+		if e.Client.Token() == "" {
+			ret.Message = "No token available so unable to read authenticated auto-tidy configuration for this mount. " + ret.Message
+		} else {
+			ret.Message = "This token lacks permission to read the auto-tidy configuration for this mount. " + ret.Message
+		}
+
+		return []*Result{&ret}, nil
+	}
+
+	// Guard against an empty or malformed response rather than panicking on a
+	// nil dereference or a failed type assertion.
+	if h.TidyConfig.Secret == nil || h.TidyConfig.Secret.Data == nil {
+		return nil, fmt.Errorf("error parsing API response from server for auto-tidy configuration: empty response")
+	}
+	data := h.TidyConfig.Secret.Data
+
+	isEnabled, ok := data["enabled"].(bool)
+	if !ok {
+		return nil, fmt.Errorf("error parsing API response from server for enabled: unexpected value %v", data["enabled"])
+	}
+
+	intervalDuration, err := parseutil.ParseDurationSecond(data["interval_duration"])
+	if err != nil {
+		return nil, fmt.Errorf("error parsing API response from server for interval_duration: %w", err)
+	}
+
+	pauseDuration, err := parseutil.ParseDurationSecond(data["pause_duration"])
+	if err != nil {
+		return nil, fmt.Errorf("error parsing API response from server for pause_duration: %w", err)
+	}
+
+	if !isEnabled {
+		ret := Result{
+			Status:   ResultInformational,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  "Auto-tidy is currently disabled; consider enabling auto-tidy to execute tidy operations periodically. This helps the health and performance of a mount.",
+		}
+		return append(results, &ret), nil
+	}
+
+	baseMsg := "Auto-tidy is configured with too long of a value for %v (%v); this could impact performance as tidies run too infrequently or take too long to execute."
+
+	// The critical threshold is checked first so that each duration yields at
+	// most one result, carrying the most severe status that applies.
+	if intervalDuration >= h.IntervalDurationCritical {
+		ret := Result{
+			Status:   ResultCritical,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  fmt.Sprintf(baseMsg, "interval_duration", intervalDuration),
+		}
+		results = append(results, &ret)
+	} else if intervalDuration >= h.IntervalDurationWarning {
+		ret := Result{
+			Status:   ResultWarning,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  fmt.Sprintf(baseMsg, "interval_duration", intervalDuration),
+		}
+		results = append(results, &ret)
+	}
+
+	if pauseDuration >= h.PauseDurationCritical {
+		ret := Result{
+			Status:   ResultCritical,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  fmt.Sprintf(baseMsg, "pause_duration", pauseDuration),
+		}
+		results = append(results, &ret)
+	} else if pauseDuration >= h.PauseDurationWarning {
+		ret := Result{
+			Status:   ResultWarning,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  fmt.Sprintf(baseMsg, "pause_duration", pauseDuration),
+		}
+		results = append(results, &ret)
+	}
+
+	return results, nil
+}
--- a/command/healthcheck/pki_tidy_last_run.go
+++ b/command/healthcheck/pki_tidy_last_run.go
@ -0,0 +1,124 @@
+package healthcheck
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/hashicorp/vault/sdk/logical"
+
+	"github.com/hashicorp/go-secure-stdlib/parseutil"
+)
+
+// TidyLastRun is a health check that reads a PKI mount's tidy status and
+// reports when the last finished tidy is older than the configured thresholds.
+type TidyLastRun struct {
+	// Enabled is populated by LoadConfig; UnsupportedVersion is set by
+	// FetchResources when the server reports the status path as unsupported.
+	Enabled            bool
+	UnsupportedVersion bool
+
+	// Ages (populated by LoadConfig) beyond which the last tidy run is
+	// reported as critical/warning.
+	LastRunCritical time.Duration
+	LastRunWarning  time.Duration
+
+	// TidyStatus holds the fetched /{{mount}}/tidy-status response.
+	TidyStatus *PathFetch
+}
+
+// NewTidyLastRunCheck returns a new, not yet configured TidyLastRun check.
+func NewTidyLastRunCheck() Check {
+	return &TidyLastRun{}
+}
+
+// Name returns the identifier of this health check.
+func (h *TidyLastRun) Name() string {
+	return "tidy_last_run"
+}
+
+// IsEnabled reports the value of the Enabled field, which LoadConfig sets.
+func (h *TidyLastRun) IsEnabled() bool {
+	return h.Enabled
+}
+
+// DefaultConfig returns the default last-run age thresholds for this check,
+// keyed by the parameter names that LoadConfig reads.
+func (h *TidyLastRun) DefaultConfig() map[string]interface{} {
+	return map[string]interface{}{
+		"last_run_critical": "7d",
+		"last_run_warning":  "2d",
+	}
+}
+
+// LoadConfig populates the last-run age thresholds and the enabled flag from
+// config, returning the first parse error encountered.
+func (h *TidyLastRun) LoadConfig(config map[string]interface{}) error {
+	// parseThreshold parses one duration parameter and, on failure, wraps the
+	// error with the check name, parameter name and raw value.
+	parseThreshold := func(param string) (time.Duration, error) {
+		parsed, parseErr := parseutil.ParseDurationSecond(config[param])
+		if parseErr != nil {
+			return parsed, fmt.Errorf("failed to parse parameter %v.%v=%v: %w", h.Name(), param, config[param], parseErr)
+		}
+		return parsed, nil
+	}
+
+	var err error
+	if h.LastRunCritical, err = parseThreshold("last_run_critical"); err != nil {
+		return err
+	}
+	if h.LastRunWarning, err = parseThreshold("last_run_warning"); err != nil {
+		return err
+	}
+
+	isEnabled, err := parseutil.ParseBool(config["enabled"])
+	if err != nil {
+		return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
+	}
+
+	h.Enabled = isEnabled
+	return nil
+}
+
+// FetchResources reads the mount's tidy status through the executor and
+// records whether the server reported the path as unsupported. A fetch
+// failure is returned wrapped, so callers can still inspect the underlying
+// error.
+func (h *TidyLastRun) FetchResources(e *Executor) error {
+	var err error
+
+	h.TidyStatus, err = e.FetchIfNotFetched(logical.ReadOperation, "/{{mount}}/tidy-status")
+	if err != nil {
+		// Wrap with %w (not %v) to keep the error chain intact, matching the
+		// other error wraps in this package.
+		return fmt.Errorf("failed to fetch mount's tidy-status value: %w", err)
+	}
+
+	if h.TidyStatus.IsUnsupportedPathError() {
+		h.UnsupportedVersion = true
+	}
+
+	return nil
+}
+
+// Evaluate turns the fetched tidy status into a single health check result.
+//
+// It returns ResultInvalidVersion or ResultInsufficientPermissions when the
+// status could not be read, ResultCritical when no tidy has ever finished or
+// the last one finished longer ago than LastRunCritical, ResultWarning when
+// it finished longer ago than LastRunWarning, and ResultOK otherwise. No
+// results are returned when nothing was fetched. An error is returned when
+// the reported time_finished value cannot be parsed.
+func (h *TidyLastRun) Evaluate(e *Executor) (results []*Result, err error) {
+	if h.UnsupportedVersion {
+		// Only reachable when the server reported the tidy-status path as
+		// unsupported.
+		ret := Result{
+			Status:   ResultInvalidVersion,
+			Endpoint: "/{{mount}}/tidy-status",
+			Message:  "This health check requires Vault 1.10+ but an earlier version of Vault Server was contacted, preventing this health check from running.",
+		}
+		return []*Result{&ret}, nil
+	}
+
+	if h.TidyStatus == nil {
+		return nil, nil
+	}
+
+	// Without this guard a permission-denied read falls through with no
+	// secret and is misreported below as "Tidy has run recently".
+	if h.TidyStatus.IsSecretPermissionsError() {
+		ret := Result{
+			Status:   ResultInsufficientPermissions,
+			Endpoint: "/{{mount}}/tidy-status",
+			Message:  "Without this information, this health check is unable to function.",
+		}
+
+		if e.Client.Token() == "" {
+			ret.Message = "No token available so unable to read the authenticated tidy status for this mount. " + ret.Message
+		} else {
+			ret.Message = "This token lacks permission to read the tidy status for this mount. " + ret.Message
+		}
+
+		return []*Result{&ret}, nil
+	}
+
+	baseMsg := "Tidy hasn't run in the last %v; this can point to problems with the mount's auto-tidy configuration or an external tidy executor; this can impact PKI's and Vault's performance if not run regularly."
+
+	ret := Result{
+		Status:   ResultOK,
+		Endpoint: "/{{mount}}/tidy-status",
+		Message:  "Tidy has run recently on this mount.",
+	}
+
+	// NOTE(review): a response with no secret data still reports OK, as
+	// before; confirm whether that case can occur and what it should report.
+	if h.TidyStatus.Secret != nil && h.TidyStatus.Secret.Data != nil {
+		when := h.TidyStatus.Secret.Data["time_finished"]
+		if when == nil {
+			ret.Status = ResultCritical
+			ret.Message = "Tidy hasn't run since this mount was created; this can point to problems with the mount's auto-tidy configuration or an external tidy executor; this can impact PKI's and Vault's performance if not run regularly. It is suggested to enable auto-tidy on this mount."
+		} else {
+			now := time.Now()
+			lastRunCritical := now.Add(-1 * h.LastRunCritical)
+			lastRunWarning := now.Add(-1 * h.LastRunWarning)
+
+			whenT, err := parseutil.ParseAbsoluteTime(when)
+			if err != nil {
+				return nil, fmt.Errorf("error parsing time value (%v): %w", when, err)
+			}
+
+			if whenT.Before(lastRunCritical) {
+				ret.Status = ResultCritical
+				ret.Message = fmt.Sprintf(baseMsg, h.LastRunCritical)
+			} else if whenT.Before(lastRunWarning) {
+				ret.Status = ResultWarning
+				ret.Message = fmt.Sprintf(baseMsg, h.LastRunWarning)
+			}
+		}
+	}
+
+	results = append(results, &ret)
+
+	return results, nil
+}
--- a/command/healthcheck/pki_too_many_certs.go
+++ b/command/healthcheck/pki_too_many_certs.go
@ -0,0 +1,101 @@
+package healthcheck
+
+import (
+	"fmt"
+
+	"github.com/hashicorp/go-secure-stdlib/parseutil"
+)
+
+// TooManyCerts is a health check that counts the leaf certificates listed for
+// a PKI mount and reports when that count reaches the configured thresholds.
+type TooManyCerts struct {
+	// Enabled is populated by LoadConfig; UnsupportedVersion is set through
+	// the callback that FetchResources hands to pkiFetchLeaves.
+	Enabled            bool
+	UnsupportedVersion bool
+
+	// Certificate counts (populated by LoadConfig) at or above which the
+	// mount is reported as critical/warning.
+	CountCritical int
+	CountWarning  int
+
+	// CertCounts is the number of leaf certificates found by FetchResources.
+	CertCounts int
+}
+
+// NewTooManyCertsCheck returns a new, not yet configured TooManyCerts check.
+func NewTooManyCertsCheck() Check {
+	return &TooManyCerts{}
+}
+
+// Name returns the identifier of this health check.
+func (h *TooManyCerts) Name() string {
+	return "too_many_certs"
+}
+
+// IsEnabled reports the value of the Enabled field, which LoadConfig sets.
+func (h *TooManyCerts) IsEnabled() bool {
+	return h.Enabled
+}
+
+// DefaultConfig returns the default certificate count thresholds for this
+// check, keyed by the parameter names that LoadConfig reads.
+func (h *TooManyCerts) DefaultConfig() map[string]interface{} {
+	return map[string]interface{}{
+		"count_critical": 250000,
+		"count_warning":  50000,
+	}
+}
+
+func (h *TooManyCerts) LoadConfig(config map[string]interface{}) error {
+	value, err := parseutil.SafeParseIntRange(config["count_critical"], 1, 15000000)
+	if err != nil {
+		return fmt.Errorf("error parsing %v.count_critical: %w", h.Name(), err)
+	}
+	h.CountCritical = int(value)
+
+	value, err = parseutil.SafeParseIntRange(config["count_warning"], 1, 15000000)
+	if err != nil {
+		return fmt.Errorf("error parsing %v.count_warning: %w", h.Name(), err)
+	}
+	h.CountWarning = int(value)
+
+	h.Enabled, err = parseutil.ParseBool(config["enabled"])
+	if err != nil {
+		return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
+	}
+
+	return nil
+}
+
+// FetchResources lists the mount's leaf certificates via pkiFetchLeaves and
+// records how many were returned. UnsupportedVersion is set through the
+// callback handed to pkiFetchLeaves. Any error from pkiFetchLeaves is
+// returned as-is.
+func (h *TooManyCerts) FetchResources(e *Executor) error {
+	exit, _, leaves, err := pkiFetchLeaves(e, func() {
+		h.UnsupportedVersion = true
+	})
+	if exit || err != nil {
+		return err
+	}
+
+	// Count the returned serials directly instead of type-asserting the
+	// cached "count" entry: the unchecked assertion panics if that entry is
+	// ever absent, and len() of the returned slice is the same number that
+	// pkiFetchLeaves caches.
+	h.CertCounts = len(leaves)
+
+	return nil
+}
+
+// Evaluate compares the number of stored leaf certificates against the
+// configured thresholds and returns a single result: ResultInvalidVersion
+// when the listing was unsupported, ResultCritical at or above CountCritical,
+// ResultWarning at or above CountWarning, and ResultOK otherwise.
+func (h *TooManyCerts) Evaluate(e *Executor) (results []*Result, err error) {
+	if h.UnsupportedVersion {
+		// Shouldn't happen; /certs has been around forever.
+		ret := Result{
+			Status:   ResultInvalidVersion,
+			Endpoint: "/{{mount}}/certs",
+			Message:  "This health check requires Vault 1.11+ but an earlier version of Vault Server was contacted, preventing this health check from running.",
+		}
+		return []*Result{&ret}, nil
+	}
+
+	ret := Result{
+		Status:   ResultOK,
+		Endpoint: "/{{mount}}/certs",
+		Message:  "This mount has an OK number of stored certificates.",
+	}
+
+	// The advice is no_store=true: that is the role setting that stops issued
+	// certificates from being written to storage.
+	baseMsg := "This PKI mount has %v outstanding stored certificates; consider using no_store=true on roles, running tidy operations periodically, and using shorter certificate lifetimes to reduce the storage pressure on this mount."
+	if h.CertCounts >= h.CountCritical {
+		ret.Status = ResultCritical
+		ret.Message = fmt.Sprintf(baseMsg, h.CertCounts)
+	} else if h.CertCounts >= h.CountWarning {
+		ret.Status = ResultWarning
+		ret.Message = fmt.Sprintf(baseMsg, h.CertCounts)
+	}
+
+	results = append(results, &ret)
+
+	return results, nil
+}
--- a/command/pki_health_check.go
+++ b/command/pki_health_check.go
@ -202,6 +202,9 @@ func (c *PKIHealthCheckCommand) Run(args []string) int {
 	executor.AddCheck(healthcheck.NewRoleAllowsLocalhostCheck())
 	executor.AddCheck(healthcheck.NewRoleAllowsGlobWildcardsCheck())
 	executor.AddCheck(healthcheck.NewRoleNoStoreFalseCheck())
+	executor.AddCheck(healthcheck.NewEnableAutoTidyCheck())
+	executor.AddCheck(healthcheck.NewTidyLastRunCheck())
+	executor.AddCheck(healthcheck.NewTooManyCertsCheck())
 	if c.flagDefaultDisabled {
 		executor.DefaultEnabled = false
 	}