Add auto-tidy, last-tidy, and leaf cert health checks (#17901)
* Add enable_auto_tidy health check Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add tidy_last_run health check Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add too_many_certs health check Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add tidy, CRL, cert count checks to CLI Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Cache stored leaf cert count Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Correctly parse last run Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
This commit is contained in:
parent
9543067ffe
commit
8461f096e2
|
@ -198,6 +198,7 @@ func pkiFetchLeaves(e *Executor, versionError func()) (bool, *PathFetch, []strin
|
|||
leaves = append(leaves, rawSerial.(string))
|
||||
}
|
||||
leavesRet.ParsedCache["leaves"] = leaves
|
||||
leavesRet.ParsedCache["count"] = len(leaves)
|
||||
}
|
||||
|
||||
return false, leavesRet, leavesRet.ParsedCache["leaves"].([]string), nil
|
||||
|
|
|
@ -0,0 +1,185 @@
|
|||
package healthcheck
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/vault/sdk/logical"
|
||||
|
||||
"github.com/hashicorp/go-secure-stdlib/parseutil"
|
||||
)
|
||||
|
||||
type EnableAutoTidy struct {
|
||||
Enabled bool
|
||||
UnsupportedVersion bool
|
||||
|
||||
IntervalDurationCritical time.Duration
|
||||
IntervalDurationWarning time.Duration
|
||||
PauseDurationCritical time.Duration
|
||||
PauseDurationWarning time.Duration
|
||||
|
||||
TidyConfig *PathFetch
|
||||
}
|
||||
|
||||
func NewEnableAutoTidyCheck() Check {
|
||||
return &EnableAutoTidy{}
|
||||
}
|
||||
|
||||
func (h *EnableAutoTidy) Name() string {
|
||||
return "enable_auto_tidy"
|
||||
}
|
||||
|
||||
func (h *EnableAutoTidy) IsEnabled() bool {
|
||||
return h.Enabled
|
||||
}
|
||||
|
||||
func (h *EnableAutoTidy) DefaultConfig() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"interval_duration_critical": "7d",
|
||||
"interval_duration_warning": "2d",
|
||||
"pause_duration_critical": "1s",
|
||||
"pause_duration_warning": "200ms",
|
||||
}
|
||||
}
|
||||
|
||||
func (h *EnableAutoTidy) fromConfig(config map[string]interface{}, param string) (time.Duration, error) {
|
||||
value, err := parseutil.ParseDurationSecond(config[param])
|
||||
if err != nil {
|
||||
return time.Duration(0), fmt.Errorf("failed to parse parameter %v.%v=%v: %w", h.Name(), param, config[param], err)
|
||||
}
|
||||
|
||||
return value, nil
|
||||
}
|
||||
|
||||
func (h *EnableAutoTidy) LoadConfig(config map[string]interface{}) error {
|
||||
var err error
|
||||
|
||||
h.IntervalDurationCritical, err = h.fromConfig(config, "interval_duration_critical")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
h.IntervalDurationWarning, err = h.fromConfig(config, "interval_duration_warning")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
h.PauseDurationCritical, err = h.fromConfig(config, "pause_duration_critical")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
h.PauseDurationWarning, err = h.fromConfig(config, "pause_duration_warning")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
enabled, err := parseutil.ParseBool(config["enabled"])
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
|
||||
}
|
||||
h.Enabled = enabled
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *EnableAutoTidy) FetchResources(e *Executor) error {
|
||||
var err error
|
||||
h.TidyConfig, err = e.FetchIfNotFetched(logical.ReadOperation, "/{{mount}}/config/auto-tidy")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if h.TidyConfig.IsUnsupportedPathError() {
|
||||
h.UnsupportedVersion = true
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *EnableAutoTidy) Evaluate(e *Executor) (results []*Result, err error) {
|
||||
if h.UnsupportedVersion {
|
||||
ret := Result{
|
||||
Status: ResultInvalidVersion,
|
||||
Endpoint: "/{{mount}}/config/auto-tidy",
|
||||
Message: "This health check requires Vault 1.12+, but an earlier version of Vault Server was contacted, preventing this health check from running.",
|
||||
}
|
||||
return []*Result{&ret}, nil
|
||||
}
|
||||
|
||||
if h.TidyConfig == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if h.TidyConfig.IsSecretPermissionsError() {
|
||||
ret := Result{
|
||||
Status: ResultInsufficientPermissions,
|
||||
Endpoint: "/{{mount}}/config/auto-tidy",
|
||||
Message: "This prevents the health check from functioning at all, as it cannot .",
|
||||
}
|
||||
|
||||
if e.Client.Token() == "" {
|
||||
ret.Message = "No token available so unable read authenticated auto-tidy configuration for this mount. " + ret.Message
|
||||
} else {
|
||||
ret.Message = "This token lacks permission to read the auto-tidy configuration for this mount. " + ret.Message
|
||||
}
|
||||
|
||||
return []*Result{&ret}, nil
|
||||
}
|
||||
|
||||
isEnabled := h.TidyConfig.Secret.Data["enabled"].(bool)
|
||||
intervalDuration, err := parseutil.ParseDurationSecond(h.TidyConfig.Secret.Data["interval_duration"])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing API response from server for interval_duration: %w", err)
|
||||
}
|
||||
|
||||
pauseDuration, err := parseutil.ParseDurationSecond(h.TidyConfig.Secret.Data["pause_duration"])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing API response from server for pause_duration: %w", err)
|
||||
}
|
||||
|
||||
if !isEnabled {
|
||||
ret := Result{
|
||||
Status: ResultInformational,
|
||||
Endpoint: "/{{mount}}/config/auto-tidy",
|
||||
Message: "Auto-tidy is currently disabled; consider enabling auto-tidy to execute tidy operations periodically. This helps the health and performance of a mount.",
|
||||
}
|
||||
results = append(results, &ret)
|
||||
} else {
|
||||
baseMsg := "Auto-tidy is configured with too long of a value for %v (%v); this could impact performance as tidies run too infrequently or take too long to execute."
|
||||
|
||||
if intervalDuration >= h.IntervalDurationCritical {
|
||||
ret := Result{
|
||||
Status: ResultCritical,
|
||||
Endpoint: "/{{mount}}/config/auto-tidy",
|
||||
Message: fmt.Sprintf(baseMsg, "interval_duration", intervalDuration),
|
||||
}
|
||||
results = append(results, &ret)
|
||||
} else if intervalDuration >= h.IntervalDurationWarning {
|
||||
ret := Result{
|
||||
Status: ResultWarning,
|
||||
Endpoint: "/{{mount}}/config/auto-tidy",
|
||||
Message: fmt.Sprintf(baseMsg, "interval_duration", intervalDuration),
|
||||
}
|
||||
results = append(results, &ret)
|
||||
}
|
||||
|
||||
if pauseDuration >= h.PauseDurationCritical {
|
||||
ret := Result{
|
||||
Status: ResultCritical,
|
||||
Endpoint: "/{{mount}}/config/auto-tidy",
|
||||
Message: fmt.Sprintf(baseMsg, "pause_duration", pauseDuration),
|
||||
}
|
||||
results = append(results, &ret)
|
||||
} else if pauseDuration >= h.PauseDurationWarning {
|
||||
ret := Result{
|
||||
Status: ResultWarning,
|
||||
Endpoint: "/{{mount}}/config/auto-tidy",
|
||||
Message: fmt.Sprintf(baseMsg, "pause_duration", pauseDuration),
|
||||
}
|
||||
results = append(results, &ret)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
|
@ -0,0 +1,124 @@
|
|||
package healthcheck
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/vault/sdk/logical"
|
||||
|
||||
"github.com/hashicorp/go-secure-stdlib/parseutil"
|
||||
)
|
||||
|
||||
type TidyLastRun struct {
|
||||
Enabled bool
|
||||
UnsupportedVersion bool
|
||||
|
||||
LastRunCritical time.Duration
|
||||
LastRunWarning time.Duration
|
||||
|
||||
TidyStatus *PathFetch
|
||||
}
|
||||
|
||||
func NewTidyLastRunCheck() Check {
|
||||
return &TidyLastRun{}
|
||||
}
|
||||
|
||||
func (h *TidyLastRun) Name() string {
|
||||
return "tidy_last_run"
|
||||
}
|
||||
|
||||
func (h *TidyLastRun) IsEnabled() bool {
|
||||
return h.Enabled
|
||||
}
|
||||
|
||||
func (h *TidyLastRun) DefaultConfig() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"last_run_critical": "7d",
|
||||
"last_run_warning": "2d",
|
||||
}
|
||||
}
|
||||
|
||||
func (h *TidyLastRun) LoadConfig(config map[string]interface{}) error {
|
||||
var err error
|
||||
h.LastRunCritical, err = parseutil.ParseDurationSecond(config["last_run_critical"])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse parameter %v.%v=%v: %w", h.Name(), "last_run_critical", config["last_run_critical"], err)
|
||||
}
|
||||
|
||||
h.LastRunWarning, err = parseutil.ParseDurationSecond(config["last_run_warning"])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse parameter %v.%v=%v: %w", h.Name(), "last_run_warning", config["last_run_warning"], err)
|
||||
}
|
||||
|
||||
enabled, err := parseutil.ParseBool(config["enabled"])
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
|
||||
}
|
||||
h.Enabled = enabled
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *TidyLastRun) FetchResources(e *Executor) error {
|
||||
var err error
|
||||
|
||||
h.TidyStatus, err = e.FetchIfNotFetched(logical.ReadOperation, "/{{mount}}/tidy-status")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to fetch mount's tidy-status value: %v", err)
|
||||
}
|
||||
|
||||
if h.TidyStatus.IsUnsupportedPathError() {
|
||||
h.UnsupportedVersion = true
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *TidyLastRun) Evaluate(e *Executor) (results []*Result, err error) {
|
||||
if h.UnsupportedVersion {
|
||||
// Shouldn't happen; roles have been around forever.
|
||||
ret := Result{
|
||||
Status: ResultInvalidVersion,
|
||||
Endpoint: "/{{mount}}/tidy-status",
|
||||
Message: "This health check requires Vault 1.10+ but an earlier version of Vault Server was contacted, preventing this health check from running.",
|
||||
}
|
||||
return []*Result{&ret}, nil
|
||||
}
|
||||
|
||||
baseMsg := "Tidy hasn't run in the last %v; this can point to problems with the mount's auto-tidy configuration or an external tidy executor; this can impact PKI's and Vault's performance if not run regularly."
|
||||
|
||||
ret := Result{
|
||||
Status: ResultOK,
|
||||
Endpoint: "/{{mount}}/tidy-status",
|
||||
Message: "Tidy has run recently on this mount.",
|
||||
}
|
||||
|
||||
if h.TidyStatus.Secret != nil && h.TidyStatus.Secret.Data != nil {
|
||||
when := h.TidyStatus.Secret.Data["time_finished"]
|
||||
if when == nil {
|
||||
ret.Status = ResultCritical
|
||||
ret.Message = "Tidy hasn't run since this mount was created; this can point to problems with the mount's auto-tidy configuration or an external tidy executor; this can impact PKI's and Vault's performance if not run regularly. It is suggested to enable auto-tidy on this mount."
|
||||
} else {
|
||||
now := time.Now()
|
||||
lastRunCritical := now.Add(-1 * h.LastRunCritical)
|
||||
lastRunWarning := now.Add(-1 * h.LastRunWarning)
|
||||
|
||||
whenT, err := parseutil.ParseAbsoluteTime(when)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing time value (%v): %w", when, err)
|
||||
}
|
||||
|
||||
if whenT.Before(lastRunCritical) {
|
||||
ret.Status = ResultCritical
|
||||
ret.Message = fmt.Sprintf(baseMsg, h.LastRunCritical)
|
||||
} else if whenT.Before(lastRunWarning) {
|
||||
ret.Status = ResultWarning
|
||||
ret.Message = fmt.Sprintf(baseMsg, h.LastRunWarning)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
results = append(results, &ret)
|
||||
|
||||
return
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
package healthcheck
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/hashicorp/go-secure-stdlib/parseutil"
|
||||
)
|
||||
|
||||
// TooManyCerts is the "too_many_certs" health check. It compares the number
// of stored leaf certificates on the mount against configurable thresholds.
type TooManyCerts struct {
	// Enabled is populated by LoadConfig from the "enabled" config key and
	// is the value returned by IsEnabled.
	Enabled bool
	// UnsupportedVersion is set through the callback FetchResources hands
	// to pkiFetchLeaves.
	UnsupportedVersion bool

	// Count thresholds populated by LoadConfig. Evaluate compares
	// CertCounts against them using >=.
	CountCritical int
	CountWarning  int

	// CertCounts is the leaf certificate count recorded by FetchResources.
	CertCounts int
}
|
||||
|
||||
func NewTooManyCertsCheck() Check {
|
||||
return &TooManyCerts{}
|
||||
}
|
||||
|
||||
func (h *TooManyCerts) Name() string {
|
||||
return "too_many_certs"
|
||||
}
|
||||
|
||||
func (h *TooManyCerts) IsEnabled() bool {
|
||||
return h.Enabled
|
||||
}
|
||||
|
||||
func (h *TooManyCerts) DefaultConfig() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"count_critical": 250000,
|
||||
"count_warning": 50000,
|
||||
}
|
||||
}
|
||||
|
||||
func (h *TooManyCerts) LoadConfig(config map[string]interface{}) error {
|
||||
value, err := parseutil.SafeParseIntRange(config["count_critical"], 1, 15000000)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing %v.count_critical: %w", h.Name(), err)
|
||||
}
|
||||
h.CountCritical = int(value)
|
||||
|
||||
value, err = parseutil.SafeParseIntRange(config["count_warning"], 1, 15000000)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing %v.count_warning: %w", h.Name(), err)
|
||||
}
|
||||
h.CountWarning = int(value)
|
||||
|
||||
h.Enabled, err = parseutil.ParseBool(config["enabled"])
|
||||
if err != nil {
|
||||
return fmt.Errorf("error parsing %v.enabled: %w", h.Name(), err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *TooManyCerts) FetchResources(e *Executor) error {
|
||||
exit, leavesRet, _, err := pkiFetchLeaves(e, func() {
|
||||
h.UnsupportedVersion = true
|
||||
})
|
||||
if exit {
|
||||
return err
|
||||
}
|
||||
|
||||
h.CertCounts = leavesRet.ParsedCache["count"].(int)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *TooManyCerts) Evaluate(e *Executor) (results []*Result, err error) {
|
||||
if h.UnsupportedVersion {
|
||||
// Shouldn't happen; /certs has been around forever.
|
||||
ret := Result{
|
||||
Status: ResultInvalidVersion,
|
||||
Endpoint: "/{{mount}}/certs",
|
||||
Message: "This health check requires Vault 1.11+ but an earlier version of Vault Server was contacted, preventing this health check from running.",
|
||||
}
|
||||
return []*Result{&ret}, nil
|
||||
}
|
||||
|
||||
ret := Result{
|
||||
Status: ResultOK,
|
||||
Endpoint: "/{{mount}}/certs",
|
||||
Message: "This mount has an OK number of stored certificates.",
|
||||
}
|
||||
|
||||
baseMsg := "This PKI mount has %v outstanding stored certificates; consider using no_store=false on roles, running tidy operations periodically, and using shorter certificate lifetimes to reduce the storage pressure on this mount."
|
||||
if h.CertCounts >= h.CountCritical {
|
||||
ret.Status = ResultCritical
|
||||
ret.Message = fmt.Sprintf(baseMsg, h.CertCounts)
|
||||
} else if h.CertCounts >= h.CountWarning {
|
||||
ret.Status = ResultWarning
|
||||
ret.Message = fmt.Sprintf(baseMsg, h.CertCounts)
|
||||
}
|
||||
|
||||
results = append(results, &ret)
|
||||
|
||||
return
|
||||
}
|
|
@ -202,6 +202,9 @@ func (c *PKIHealthCheckCommand) Run(args []string) int {
|
|||
executor.AddCheck(healthcheck.NewRoleAllowsLocalhostCheck())
|
||||
executor.AddCheck(healthcheck.NewRoleAllowsGlobWildcardsCheck())
|
||||
executor.AddCheck(healthcheck.NewRoleNoStoreFalseCheck())
|
||||
executor.AddCheck(healthcheck.NewEnableAutoTidyCheck())
|
||||
executor.AddCheck(healthcheck.NewTidyLastRunCheck())
|
||||
executor.AddCheck(healthcheck.NewTooManyCertsCheck())
|
||||
if c.flagDefaultDisabled {
|
||||
executor.DefaultEnabled = false
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue