From 8461f096e2c48ba8805708a6bd322f9839ef2802 Mon Sep 17 00:00:00 2001
From: Alexander Scheel
Date: Fri, 18 Nov 2022 11:04:58 -0500
Subject: [PATCH] Add auto-tidy, last-tidy, and leaf cert health checks (#17901)

* Add enable_auto_tidy health check

Signed-off-by: Alexander Scheel

* Add tidy_last_run health check

Signed-off-by: Alexander Scheel

* Add too_many_certs health check

Signed-off-by: Alexander Scheel

* Add tidy, CRL, cert count checks to CLI

Signed-off-by: Alexander Scheel

* Cache stored leaf cert count

Signed-off-by: Alexander Scheel

* Correctly parse last run

Signed-off-by: Alexander Scheel

Signed-off-by: Alexander Scheel
---
 command/healthcheck/pki.go                  |   1 +
 command/healthcheck/pki_enable_auto_tidy.go | 209 ++++++++++++++++++++
 command/healthcheck/pki_tidy_last_run.go    | 157 +++++++++++++++
 command/healthcheck/pki_too_many_certs.go   | 111 ++++++++++
 command/pki_health_check.go                 |   3 +
 5 files changed, 481 insertions(+)
 create mode 100644 command/healthcheck/pki_enable_auto_tidy.go
 create mode 100644 command/healthcheck/pki_tidy_last_run.go
 create mode 100644 command/healthcheck/pki_too_many_certs.go

diff --git a/command/healthcheck/pki.go b/command/healthcheck/pki.go
index 5fcfac219..b46f460a9 100644
--- a/command/healthcheck/pki.go
+++ b/command/healthcheck/pki.go
@@ -198,6 +198,7 @@ func pkiFetchLeaves(e *Executor, versionError func()) (bool, *PathFetch, []strin
 			leaves = append(leaves, rawSerial.(string))
 		}
 		leavesRet.ParsedCache["leaves"] = leaves
+		leavesRet.ParsedCache["count"] = len(leaves)
 	}
 
 	return false, leavesRet, leavesRet.ParsedCache["leaves"].([]string), nil
diff --git a/command/healthcheck/pki_enable_auto_tidy.go b/command/healthcheck/pki_enable_auto_tidy.go
new file mode 100644
index 000000000..1a083be81
--- /dev/null
+++ b/command/healthcheck/pki_enable_auto_tidy.go
@@ -0,0 +1,209 @@
+package healthcheck
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/hashicorp/vault/sdk/logical"
+
+	"github.com/hashicorp/go-secure-stdlib/parseutil"
+)
+
+// EnableAutoTidy is a health check that reports when auto-tidy is disabled on
+// a PKI mount, or is enabled with an interval or pause duration at or above
+// the configured warning/critical thresholds.
+type EnableAutoTidy struct {
+	Enabled            bool
+	UnsupportedVersion bool
+
+	IntervalDurationCritical time.Duration
+	IntervalDurationWarning  time.Duration
+	PauseDurationCritical    time.Duration
+	PauseDurationWarning     time.Duration
+
+	TidyConfig *PathFetch
+}
+
+// NewEnableAutoTidyCheck returns a new, unconfigured enable_auto_tidy check.
+func NewEnableAutoTidyCheck() Check {
+	return &EnableAutoTidy{}
+}
+
+func (h *EnableAutoTidy) Name() string {
+	return "enable_auto_tidy"
+}
+
+func (h *EnableAutoTidy) IsEnabled() bool {
+	return h.Enabled
+}
+
+// DefaultConfig returns the default warning/critical duration thresholds.
+func (h *EnableAutoTidy) DefaultConfig() map[string]interface{} {
+	return map[string]interface{}{
+		"interval_duration_critical": "7d",
+		"interval_duration_warning":  "2d",
+		"pause_duration_critical":    "1s",
+		"pause_duration_warning":     "200ms",
+	}
+}
+
+// fromConfig parses config[param] as a duration, wrapping any parse failure
+// with the check and parameter name.
+func (h *EnableAutoTidy) fromConfig(config map[string]interface{}, param string) (time.Duration, error) {
+	value, err := parseutil.ParseDurationSecond(config[param])
+	if err != nil {
+		return time.Duration(0), fmt.Errorf("failed to parse parameter %v.%v=%v: %w", h.Name(), param, config[param], err)
+	}
+
+	return value, nil
+}
+
+// LoadConfig parses the four duration thresholds and the enabled flag from
+// config, returning the first parse error encountered.
+func (h *EnableAutoTidy) LoadConfig(config map[string]interface{}) error {
+	var err error
+
+	h.IntervalDurationCritical, err = h.fromConfig(config, "interval_duration_critical")
+	if err != nil {
+		return err
+	}
+
+	h.IntervalDurationWarning, err = h.fromConfig(config, "interval_duration_warning")
+	if err != nil {
+		return err
+	}
+
+	h.PauseDurationCritical, err = h.fromConfig(config, "pause_duration_critical")
+	if err != nil {
+		return err
+	}
+
+	h.PauseDurationWarning, err = h.fromConfig(config, "pause_duration_warning")
+	if err != nil {
+		return err
+	}
+
+	enabled, err := parseutil.ParseBool(config["enabled"])
+	if err != nil {
+		return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
+	}
+	h.Enabled = enabled
+
+	return nil
+}
+
+// FetchResources reads the mount's auto-tidy configuration, noting when the
+// server does not support that endpoint.
+func (h *EnableAutoTidy) FetchResources(e *Executor) error {
+	var err error
+	h.TidyConfig, err = e.FetchIfNotFetched(logical.ReadOperation, "/{{mount}}/config/auto-tidy")
+	if err != nil {
+		return err
+	}
+
+	if h.TidyConfig.IsUnsupportedPathError() {
+		h.UnsupportedVersion = true
+	}
+
+	return nil
+}
+
+// Evaluate inspects the fetched auto-tidy configuration. It returns a single
+// result when the endpoint is unsupported, unreadable, or auto-tidy is
+// disabled; otherwise one result per duration that crosses a threshold.
+func (h *EnableAutoTidy) Evaluate(e *Executor) (results []*Result, err error) {
+	if h.UnsupportedVersion {
+		ret := Result{
+			Status:   ResultInvalidVersion,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  "This health check requires Vault 1.12+, but an earlier version of Vault Server was contacted, preventing this health check from running.",
+		}
+		return []*Result{&ret}, nil
+	}
+
+	if h.TidyConfig == nil {
+		return
+	}
+
+	if h.TidyConfig.IsSecretPermissionsError() {
+		ret := Result{
+			Status:   ResultInsufficientPermissions,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  "This prevents the health check from functioning at all, as it cannot determine whether auto-tidy is enabled.",
+		}
+
+		if e.Client.Token() == "" {
+			ret.Message = "No token available so unable to read authenticated auto-tidy configuration for this mount. " + ret.Message
+		} else {
+			ret.Message = "This token lacks permission to read the auto-tidy configuration for this mount. " + ret.Message
+		}
+
+		return []*Result{&ret}, nil
+	}
+
+	// Guard against a response with no payload (as tidy_last_run does) rather
+	// than panicking on a nil dereference or a failed type assertion.
+	if h.TidyConfig.Secret == nil || h.TidyConfig.Secret.Data == nil {
+		return nil, fmt.Errorf("no auto-tidy configuration data returned from server")
+	}
+
+	isEnabled, ok := h.TidyConfig.Secret.Data["enabled"].(bool)
+	if !ok {
+		return nil, fmt.Errorf("error parsing API response from server for enabled: expected bool, got %T", h.TidyConfig.Secret.Data["enabled"])
+	}
+
+	intervalDuration, err := parseutil.ParseDurationSecond(h.TidyConfig.Secret.Data["interval_duration"])
+	if err != nil {
+		return nil, fmt.Errorf("error parsing API response from server for interval_duration: %w", err)
+	}
+
+	pauseDuration, err := parseutil.ParseDurationSecond(h.TidyConfig.Secret.Data["pause_duration"])
+	if err != nil {
+		return nil, fmt.Errorf("error parsing API response from server for pause_duration: %w", err)
+	}
+
+	if !isEnabled {
+		ret := Result{
+			Status:   ResultInformational,
+			Endpoint: "/{{mount}}/config/auto-tidy",
+			Message:  "Auto-tidy is currently disabled; consider enabling auto-tidy to execute tidy operations periodically. This helps the health and performance of a mount.",
+		}
+		results = append(results, &ret)
+	} else {
+		baseMsg := "Auto-tidy is configured with too long of a value for %v (%v); this could impact performance as tidies run too infrequently or take too long to execute."
+
+		if intervalDuration >= h.IntervalDurationCritical {
+			ret := Result{
+				Status:   ResultCritical,
+				Endpoint: "/{{mount}}/config/auto-tidy",
+				Message:  fmt.Sprintf(baseMsg, "interval_duration", intervalDuration),
+			}
+			results = append(results, &ret)
+		} else if intervalDuration >= h.IntervalDurationWarning {
+			ret := Result{
+				Status:   ResultWarning,
+				Endpoint: "/{{mount}}/config/auto-tidy",
+				Message:  fmt.Sprintf(baseMsg, "interval_duration", intervalDuration),
+			}
+			results = append(results, &ret)
+		}
+
+		if pauseDuration >= h.PauseDurationCritical {
+			ret := Result{
+				Status:   ResultCritical,
+				Endpoint: "/{{mount}}/config/auto-tidy",
+				Message:  fmt.Sprintf(baseMsg, "pause_duration", pauseDuration),
+			}
+			results = append(results, &ret)
+		} else if pauseDuration >= h.PauseDurationWarning {
+			ret := Result{
+				Status:   ResultWarning,
+				Endpoint: "/{{mount}}/config/auto-tidy",
+				Message:  fmt.Sprintf(baseMsg, "pause_duration", pauseDuration),
+			}
+			results = append(results, &ret)
+		}
+	}
+
+	return
+}
diff --git a/command/healthcheck/pki_tidy_last_run.go b/command/healthcheck/pki_tidy_last_run.go
new file mode 100644
index 000000000..3dd1df017
--- /dev/null
+++ b/command/healthcheck/pki_tidy_last_run.go
@@ -0,0 +1,157 @@
+package healthcheck
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/hashicorp/vault/sdk/logical"
+
+	"github.com/hashicorp/go-secure-stdlib/parseutil"
+)
+
+// TidyLastRun is a health check that reports when tidy has never finished on
+// a PKI mount, or last finished longer ago than the configured thresholds.
+type TidyLastRun struct {
+	Enabled            bool
+	UnsupportedVersion bool
+
+	LastRunCritical time.Duration
+	LastRunWarning  time.Duration
+
+	TidyStatus *PathFetch
+}
+
+// NewTidyLastRunCheck returns a new, unconfigured tidy_last_run check.
+func NewTidyLastRunCheck() Check {
+	return &TidyLastRun{}
+}
+
+func (h *TidyLastRun) Name() string {
+	return "tidy_last_run"
+}
+
+func (h *TidyLastRun) IsEnabled() bool {
+	return h.Enabled
+}
+
+// DefaultConfig returns the default warning/critical last-run thresholds.
+func (h *TidyLastRun) DefaultConfig() map[string]interface{} {
+	return map[string]interface{}{
+		"last_run_critical": "7d",
+		"last_run_warning":  "2d",
+	}
+}
+
+// LoadConfig parses the two last-run thresholds and the enabled flag from
+// config, returning the first parse error encountered.
+func (h *TidyLastRun) LoadConfig(config map[string]interface{}) error {
+	var err error
+	h.LastRunCritical, err = parseutil.ParseDurationSecond(config["last_run_critical"])
+	if err != nil {
+		return fmt.Errorf("failed to parse parameter %v.%v=%v: %w", h.Name(), "last_run_critical", config["last_run_critical"], err)
+	}
+
+	h.LastRunWarning, err = parseutil.ParseDurationSecond(config["last_run_warning"])
+	if err != nil {
+		return fmt.Errorf("failed to parse parameter %v.%v=%v: %w", h.Name(), "last_run_warning", config["last_run_warning"], err)
+	}
+
+	enabled, err := parseutil.ParseBool(config["enabled"])
+	if err != nil {
+		return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
+	}
+	h.Enabled = enabled
+
+	return nil
+}
+
+// FetchResources reads the mount's tidy-status, noting when the server does
+// not support that endpoint.
+func (h *TidyLastRun) FetchResources(e *Executor) error {
+	var err error
+
+	h.TidyStatus, err = e.FetchIfNotFetched(logical.ReadOperation, "/{{mount}}/tidy-status")
+	if err != nil {
+		return fmt.Errorf("failed to fetch mount's tidy-status value: %w", err)
+	}
+
+	if h.TidyStatus.IsUnsupportedPathError() {
+		h.UnsupportedVersion = true
+	}
+
+	return nil
+}
+
+// Evaluate reports critical when tidy never finished or last finished before
+// the LastRunCritical window, warning for LastRunWarning, and OK otherwise.
+// Unsupported servers and permission failures yield a dedicated result.
+func (h *TidyLastRun) Evaluate(e *Executor) (results []*Result, err error) {
+	if h.UnsupportedVersion {
+		// Only reachable against servers that predate the tidy-status endpoint.
+		ret := Result{
+			Status:   ResultInvalidVersion,
+			Endpoint: "/{{mount}}/tidy-status",
+			Message:  "This health check requires Vault 1.10+ but an earlier version of Vault Server was contacted, preventing this health check from running.",
+		}
+		return []*Result{&ret}, nil
+	}
+
+	// Nothing was fetched, so there is nothing to evaluate.
+	if h.TidyStatus == nil {
+		return
+	}
+
+	// A permission failure must not fall through to the default OK result below.
+	if h.TidyStatus.IsSecretPermissionsError() {
+		ret := Result{
+			Status:   ResultInsufficientPermissions,
+			Endpoint: "/{{mount}}/tidy-status",
+			Message:  "Without this information, this health check is unable to function.",
+		}
+
+		if e.Client.Token() == "" {
+			ret.Message = "No token available so unable to read the tidy status for this mount. " + ret.Message
+		} else {
+			ret.Message = "This token lacks permission to read the tidy status for this mount. " + ret.Message
+		}
+
+		return []*Result{&ret}, nil
+	}
+
+	baseMsg := "Tidy hasn't run in the last %v; this can point to problems with the mount's auto-tidy configuration or an external tidy executor; this can impact PKI's and Vault's performance if not run regularly."
+
+	ret := Result{
+		Status:   ResultOK,
+		Endpoint: "/{{mount}}/tidy-status",
+		Message:  "Tidy has run recently on this mount.",
+	}
+
+	if h.TidyStatus.Secret != nil && h.TidyStatus.Secret.Data != nil {
+		when := h.TidyStatus.Secret.Data["time_finished"]
+		if when == nil {
+			ret.Status = ResultCritical
+			ret.Message = "Tidy hasn't run since this mount was created; this can point to problems with the mount's auto-tidy configuration or an external tidy executor; this can impact PKI's and Vault's performance if not run regularly. It is suggested to enable auto-tidy on this mount."
+		} else {
+			now := time.Now()
+			lastRunCritical := now.Add(-1 * h.LastRunCritical)
+			lastRunWarning := now.Add(-1 * h.LastRunWarning)
+
+			whenT, err := parseutil.ParseAbsoluteTime(when)
+			if err != nil {
+				return nil, fmt.Errorf("error parsing time value (%v): %w", when, err)
+			}
+
+			if whenT.Before(lastRunCritical) {
+				ret.Status = ResultCritical
+				ret.Message = fmt.Sprintf(baseMsg, h.LastRunCritical)
+			} else if whenT.Before(lastRunWarning) {
+				ret.Status = ResultWarning
+				ret.Message = fmt.Sprintf(baseMsg, h.LastRunWarning)
+			}
+		}
+	}
+
+	results = append(results, &ret)
+
+	return
+}
diff --git a/command/healthcheck/pki_too_many_certs.go b/command/healthcheck/pki_too_many_certs.go
new file mode 100644
index 000000000..4210ca581
--- /dev/null
+++ b/command/healthcheck/pki_too_many_certs.go
@@ -0,0 +1,111 @@
+package healthcheck
+
+import (
+	"fmt"
+
+	"github.com/hashicorp/go-secure-stdlib/parseutil"
+)
+
+// TooManyCerts is a health check that reports when the number of leaf
+// certificates stored on a PKI mount reaches the configured thresholds.
+type TooManyCerts struct {
+	Enabled            bool
+	UnsupportedVersion bool
+
+	CountCritical int
+	CountWarning  int
+
+	CertCounts int
+}
+
+// NewTooManyCertsCheck returns a new, unconfigured too_many_certs check.
+func NewTooManyCertsCheck() Check {
+	return &TooManyCerts{}
+}
+
+func (h *TooManyCerts) Name() string {
+	return "too_many_certs"
+}
+
+func (h *TooManyCerts) IsEnabled() bool {
+	return h.Enabled
+}
+
+// DefaultConfig returns the default warning/critical certificate counts.
+func (h *TooManyCerts) DefaultConfig() map[string]interface{} {
+	return map[string]interface{}{
+		"count_critical": 250000,
+		"count_warning":  50000,
+	}
+}
+
+// LoadConfig parses the two count thresholds (bounded by 1 and 15000000) and
+// the enabled flag from config, returning the first parse error encountered.
+func (h *TooManyCerts) LoadConfig(config map[string]interface{}) error {
+	value, err := parseutil.SafeParseIntRange(config["count_critical"], 1, 15000000)
+	if err != nil {
+		return fmt.Errorf("error parsing %v.count_critical: %w", h.Name(), err)
+	}
+	h.CountCritical = int(value)
+
+	value, err = parseutil.SafeParseIntRange(config["count_warning"], 1, 15000000)
+	if err != nil {
+		return fmt.Errorf("error parsing %v.count_warning: %w", h.Name(), err)
+	}
+	h.CountWarning = int(value)
+
+	h.Enabled, err = parseutil.ParseBool(config["enabled"])
+	if err != nil {
+		return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
+	}
+
+	return nil
+}
+
+// FetchResources records the stored leaf certificate count cached by
+// pkiFetchLeaves, returning early when that helper signals an exit.
+func (h *TooManyCerts) FetchResources(e *Executor) error {
+	exit, leavesRet, _, err := pkiFetchLeaves(e, func() {
+		h.UnsupportedVersion = true
+	})
+	if exit {
+		return err
+	}
+
+	h.CertCounts = leavesRet.ParsedCache["count"].(int)
+
+	return nil
+}
+
+// Evaluate reports an invalid-version result for unsupported servers;
+// otherwise it compares the stored certificate count against both thresholds.
+func (h *TooManyCerts) Evaluate(e *Executor) (results []*Result, err error) {
+	if h.UnsupportedVersion {
+		// Shouldn't happen; /certs has been around forever.
+		ret := Result{
+			Status:   ResultInvalidVersion,
+			Endpoint: "/{{mount}}/certs",
+			Message:  "This health check requires Vault 1.11+ but an earlier version of Vault Server was contacted, preventing this health check from running.",
+		}
+		return []*Result{&ret}, nil
+	}
+
+	ret := Result{
+		Status:   ResultOK,
+		Endpoint: "/{{mount}}/certs",
+		Message:  "This mount has an OK number of stored certificates.",
+	}
+
+	baseMsg := "This PKI mount has %v outstanding stored certificates; consider using no_store=true on roles, running tidy operations periodically, and using shorter certificate lifetimes to reduce the storage pressure on this mount."
+	if h.CertCounts >= h.CountCritical {
+		ret.Status = ResultCritical
+		ret.Message = fmt.Sprintf(baseMsg, h.CertCounts)
+	} else if h.CertCounts >= h.CountWarning {
+		ret.Status = ResultWarning
+		ret.Message = fmt.Sprintf(baseMsg, h.CertCounts)
+	}
+
+	results = append(results, &ret)
+
+	return
+}
diff --git a/command/pki_health_check.go b/command/pki_health_check.go
index adf2f9b91..b1f604b68 100644
--- a/command/pki_health_check.go
+++ b/command/pki_health_check.go
@@ -202,6 +202,9 @@ func (c *PKIHealthCheckCommand) Run(args []string) int {
 	executor.AddCheck(healthcheck.NewRoleAllowsLocalhostCheck())
 	executor.AddCheck(healthcheck.NewRoleAllowsGlobWildcardsCheck())
 	executor.AddCheck(healthcheck.NewRoleNoStoreFalseCheck())
+	executor.AddCheck(healthcheck.NewEnableAutoTidyCheck())
+	executor.AddCheck(healthcheck.NewTidyLastRunCheck())
+	executor.AddCheck(healthcheck.NewTooManyCertsCheck())
 	if c.flagDefaultDisabled {
 		executor.DefaultEnabled = false
 	}