Add ability to perform automatic tidy operations (#16900)
* Add ability to perform automatic tidy operations

  This enables the PKI secrets engine to allow tidy to be started periodically by the engine itself, avoiding the need for interaction. This operation is disabled by default (to avoid load on clusters which don't need tidy to be run) but can be enabled.

  In particular, a default tidy configuration is written (via /config/auto-tidy) which mirrors the options passed to /tidy. Two additional parameters, enabled and interval, are accepted, allowing auto-tidy to be enabled or disabled and controlling the interval (between successful tidy runs) to attempt auto-tidy.

  Notably, a manual execution of tidy will delay additional auto-tidy operations. Status is reported via the existing /tidy-status endpoint.

  Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add changelog entry

  Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add documentation on auto-tidy

  Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add tests for auto-tidy

  Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Prevent race during parallel testing

  We modified the RollbackManager's execution window to allow more faithful testing of the periodicFunc. However, the TestAutoRebuild and the new TestAutoTidy would then race against each other for modifying the period and creating their clusters (before resetting to the old value). This changeset adds a lock around this, preventing the races.

  Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Use tidyStatusLock to gate lastTidy time

  This prevents a data race between the periodic func and the execution of the running tidy.

  Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add read lock around tidyStatus gauges

  When reading from tidyStatus for computing gauges, since the underlying values aren't atomics, we really should be gating these with a read lock around the status access.

  Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
parent 99213c2616
commit a5fafd8163
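As a quick illustration of the feature described above (a minimal sketch, not part of this commit; it assumes the PKI engine is mounted at `pki/` and uses the Vault CLI against the documented `/config/auto-tidy` and `/tidy-status` endpoints):

```shell-session
# Enable auto-tidy to attempt a run every 12 hours, tidying both the
# certificate store and revoked certificates.
$ vault write pki/config/auto-tidy \
    enabled=true \
    interval_duration=12h \
    tidy_cert_store=true \
    tidy_revoked_certs=true \
    safety_buffer=72h

# Engine-initiated runs report their progress on the existing status endpoint.
$ vault read pki/tidy-status
```

A manual `vault write pki/tidy ...` still works and, per the change below, pushes the next automatic run out by `interval_duration`.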
@@ -132,6 +132,7 @@ func Backend(conf *logical.BackendConfig) *backend {
            pathRevokeWithKey(&b),
            pathTidy(&b),
            pathTidyStatus(&b),
            pathConfigAutoTidy(&b),

            // Issuer APIs
            pathListIssuers(&b),
@@ -191,6 +192,9 @@ func Backend(conf *logical.BackendConfig) *backend {
    b.crlBuilder = newCRLBuilder()

    // Delay the first tidy until after we've started up.
    b.lastTidy = time.Now()

    return &b
}
@@ -204,6 +208,7 @@ type backend struct {
    tidyStatusLock sync.RWMutex
    tidyStatus     *tidyStatus
    lastTidy       time.Time

    pkiStorageVersion atomic.Value
    crlBuilder        *crlBuilder
@@ -405,34 +410,105 @@ func (b *backend) invalidate(ctx context.Context, key string) {
}

func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error {
    // First attempt to reload the CRL configuration.
    sc := b.makeStorageContext(ctx, request.Storage)
    if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
        return err
    }

    // As we're (below) modifying the backing storage, we need to ensure
    // we're not on a standby/secondary node.
    if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
        b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
    doCRL := func() error {
        // First attempt to reload the CRL configuration.
        if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
            return err
        }

        // As we're (below) modifying the backing storage, we need to ensure
        // we're not on a standby/secondary node.
        if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
            b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
            return nil
        }

        // Check if we're set to auto rebuild and a CRL is set to expire.
        if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
            return err
        }

        // Then attempt to rebuild the CRLs if required.
        if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
            return err
        }

        // If a delta CRL was rebuilt above as part of the complete CRL rebuild,
        // this will be a no-op. However, if we do need to rebuild delta CRLs,
        // this would cause us to do so.
        if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc); err != nil {
            return err
        }

        return nil
    }

    // Check if we're set to auto rebuild and a CRL is set to expire.
    if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
        return err
    doAutoTidy := func() error {
        // As we're (below) modifying the backing storage, we need to ensure
        // we're not on a standby/secondary node.
        if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
            b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
            return nil
        }

        config, err := sc.getAutoTidyConfig()
        if err != nil {
            return err
        }

        if !config.Enabled || config.Interval <= 0*time.Second {
            return nil
        }

        // Check if we should run another tidy...
        now := time.Now()
        b.tidyStatusLock.RLock()
        nextOp := b.lastTidy.Add(config.Interval)
        b.tidyStatusLock.RUnlock()
        if now.Before(nextOp) {
            return nil
        }

        // Ensure a tidy isn't already running... If it is, we'll trigger
        // again when the running one finishes.
        if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
            return nil
        }

        // Prevent ourselves from starting another tidy operation while
        // this one is still running. This operation runs in the background
        // and has a separate error reporting mechanism.
        b.tidyStatusLock.Lock()
        b.lastTidy = now
        b.tidyStatusLock.Unlock()

        // Because the request from the parent storage will be cleared at
        // some point (and potentially reused) -- due to tidy executing in
        // a background goroutine -- we need to copy the storage entry off
        // of the backend instead.
        backendReq := &logical.Request{
            Storage: b.storage,
        }

        b.startTidyOperation(backendReq, config)
        return nil
    }

    // Then attempt to rebuild the CRLs if required.
    if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
        return err
    crlErr := doCRL()
    tidyErr := doAutoTidy()

    if crlErr != nil && tidyErr != nil {
        return fmt.Errorf("Error building CRLs:\n - %v\n\nError running auto-tidy:\n - %v\n", crlErr, tidyErr)
    }

    // If a delta CRL was rebuilt above as part of the complete CRL rebuild,
    // this will be a no-op. However, if we do need to rebuild delta CRLs,
    // this would cause us to do so.
    if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc); err != nil {
        return err
    if crlErr != nil {
        return fmt.Errorf("Error building CRLs:\n - %v\n", crlErr)
    }

    if tidyErr != nil {
        return fmt.Errorf("Error running auto-tidy:\n - %v\n", tidyErr)
    }

    // Check if the CRL was invalidated due to issuer swap and update
@@ -876,14 +876,11 @@ func TestAutoRebuild(t *testing.T) {
        // storage without barrier encryption.
        EnableRaw: true,
    }
    oldPeriod := vault.SetRollbackPeriodForTesting(newPeriod)
    cluster := vault.NewTestCluster(t, coreConfig, &vault.TestClusterOptions{
    cluster := vault.CreateTestClusterWithRollbackPeriod(t, newPeriod, coreConfig, &vault.TestClusterOptions{
        HandlerFunc: vaulthttp.Handler,
    })
    cluster.Start()
    defer cluster.Cleanup()
    client := cluster.Cores[0].Client
    vault.SetRollbackPeriodForTesting(oldPeriod)

    // Mount PKI
    err := client.Sys().Mount("pki", &api.MountInput{
@@ -416,3 +416,41 @@ to the key.`,
    }
    return fields
}

func addTidyFields(fields map[string]*framework.FieldSchema) map[string]*framework.FieldSchema {
    fields["tidy_cert_store"] = &framework.FieldSchema{
        Type: framework.TypeBool,
        Description: `Set to true to enable tidying up
the certificate store`,
    }

    fields["tidy_revocation_list"] = &framework.FieldSchema{
        Type:        framework.TypeBool,
        Description: `Deprecated; synonym for 'tidy_revoked_certs`,
    }

    fields["tidy_revoked_certs"] = &framework.FieldSchema{
        Type: framework.TypeBool,
        Description: `Set to true to expire all revoked
and expired certificates, removing them both from the CRL and from storage. The
CRL will be rotated if this causes any values to be removed.`,
    }

    fields["tidy_revoked_cert_issuer_associations"] = &framework.FieldSchema{
        Type: framework.TypeBool,
        Description: `Set to true to validate issuer associations
on revocation entries. This helps increase the performance of CRL building
and OCSP responses.`,
    }

    fields["safety_buffer"] = &framework.FieldSchema{
        Type: framework.TypeDurationSecond,
        Description: `The amount of extra time that must have passed
beyond certificate expiration before it is removed
from the backend storage and/or revocation list.
Defaults to 72 hours.`,
        Default: 259200, // 72h, but TypeDurationSecond currently requires defaults to be int
    }

    return fields
}
@@ -17,58 +17,33 @@ import (
)

type tidyConfig struct {
    Enabled      bool          `json:"enabled"`
    Interval     time.Duration `json:"interval_duration"`
    CertStore    bool          `json:"tidy_cert_store"`
    RevokedCerts bool          `json:"tidy_revoked_certs"`
    IssuerAssocs bool          `json:"tidy_revoked_cert_issuer_associations"`
    SafetyBuffer time.Duration `json:"safety_buffer"`
}

var defaultTidyConfig = tidyConfig{
    Enabled:      false,
    Interval:     12 * time.Hour,
    CertStore:    false,
    RevokedCerts: false,
    IssuerAssocs: false,
    SafetyBuffer: 259200 * time.Second,
}

func pathTidy(b *backend) *framework.Path {
    return &framework.Path{
        Pattern: "tidy$",
        Fields: map[string]*framework.FieldSchema{
            "tidy_cert_store": {
                Type: framework.TypeBool,
                Description: `Set to true to enable tidying up
the certificate store`,
            },

            "tidy_revocation_list": {
                Type:        framework.TypeBool,
                Description: `Deprecated; synonym for 'tidy_revoked_certs`,
            },

            "tidy_revoked_certs": {
                Type: framework.TypeBool,
                Description: `Set to true to expire all revoked
and expired certificates, removing them both from the CRL and from storage. The
CRL will be rotated if this causes any values to be removed.`,
            },

            "tidy_revoked_cert_issuer_associations": {
                Type: framework.TypeBool,
                Description: `Set to true to validate issuer associations
on revocation entries. This helps increase the performance of CRL building
and OCSP responses.`,
            },

            "safety_buffer": {
                Type: framework.TypeDurationSecond,
                Description: `The amount of extra time that must have passed
beyond certificate expiration before it is removed
from the backend storage and/or revocation list.
Defaults to 72 hours.`,
                Default: 259200, // 72h, but TypeDurationSecond currently requires defaults to be int
            },
        },

        Fields: addTidyFields(map[string]*framework.FieldSchema{}),
        Operations: map[logical.Operation]framework.OperationHandler{
            logical.UpdateOperation: &framework.PathOperation{
                Callback:                  b.pathTidyWrite,
                ForwardPerformanceStandby: true,
            },
        },

        HelpSynopsis:    pathTidyHelpSyn,
        HelpDescription: pathTidyHelpDesc,
    }
@@ -88,6 +63,36 @@ func pathTidyStatus(b *backend) *framework.Path {
    }
}

func pathConfigAutoTidy(b *backend) *framework.Path {
    return &framework.Path{
        Pattern: "config/auto-tidy",
        Fields: addTidyFields(map[string]*framework.FieldSchema{
            "enabled": {
                Type:        framework.TypeBool,
                Description: `Set to true to enable automatic tidy operations.`,
            },
            "interval_duration": {
                Type:        framework.TypeDurationSecond,
                Description: `Interval at which to run an auto-tidy operation. This is the time between tidy invocations (after one finishes to the start of the next). Running a manual tidy will reset this duration.`,
                Default:     43200, // 12h, but TypeDurationSecond currently requires the default to be an int.
            },
        }),
        Operations: map[logical.Operation]framework.OperationHandler{
            logical.ReadOperation: &framework.PathOperation{
                Callback: b.pathConfigAutoTidyRead,
            },
            logical.UpdateOperation: &framework.PathOperation{
                Callback: b.pathConfigAutoTidyWrite,
                // Read more about why these flags are set in backend.go.
                ForwardPerformanceStandby:   true,
                ForwardPerformanceSecondary: true,
            },
        },
        HelpSynopsis:    pathConfigAutoTidySyn,
        HelpDescription: pathConfigAutoTidyDesc,
    }
}

func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
    safetyBuffer := d.Get("safety_buffer").(int)
    tidyCertStore := d.Get("tidy_cert_store").(bool)
@@ -100,7 +105,10 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
    bufferDuration := time.Duration(safetyBuffer) * time.Second

    // Manual run with constructed configuration.
    config := &tidyConfig{
        Enabled:      true,
        Interval:     0 * time.Second,
        CertStore:    tidyCertStore,
        RevokedCerts: tidyRevokedCerts,
        IssuerAssocs: tidyRevokedAssocs,
@@ -119,6 +127,13 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
        Storage: req.Storage,
    }

    // Mark the last tidy operation as relatively recent, to ensure we don't
    // try to trigger the periodic function.
    b.tidyStatusLock.Lock()
    b.lastTidy = time.Now()
    b.tidyStatusLock.Unlock()

    // Kick off the actual tidy.
    b.startTidyOperation(req, config)

    resp := &logical.Response{}
@@ -163,6 +178,13 @@ func (b *backend) startTidyOperation(req *logical.Request, config *tidyConfig) {
            b.tidyStatusStop(err)
        } else {
            b.tidyStatusStop(nil)

            // Since the tidy operation finished without an error, we don't
            // really want to start another tidy right away (if the interval
            // is too short). So mark the last tidy as now.
            b.tidyStatusLock.Lock()
            b.lastTidy = time.Now()
            b.tidyStatusLock.Unlock()
        }
    }()
}
@@ -215,7 +237,9 @@ func (b *backend) doTidyCertStore(ctx context.Context, req *logical.Request, log
        }
    }

    b.tidyStatusLock.RLock()
    metrics.SetGauge([]string{"secrets", "pki", "tidy", "cert_store_total_entries_remaining"}, float32(uint(serialCount)-b.tidyStatus.certStoreDeletedCount))
    b.tidyStatusLock.RUnlock()

    return nil
}
@@ -329,9 +353,11 @@ func (b *backend) doTidyRevocationStore(ctx context.Context, req *logical.Reques
        }
    }

    b.tidyStatusLock.RLock()
    metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_total_entries_remaining"}, float32(uint(revokedSerialsCount)-b.tidyStatus.revokedCertDeletedCount))
    metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_entries_incorrect_issuers"}, float32(b.tidyStatus.missingIssuerCertCount))
    metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_entries_fixed_issuers"}, float32(fixedIssuers))
    b.tidyStatusLock.RUnlock()

    if rebuildCRL {
        // Expired certificates isn't generally an important
@@ -411,6 +437,69 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
    return resp, nil
}

func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) {
    sc := b.makeStorageContext(ctx, req.Storage)
    config, err := sc.getAutoTidyConfig()
    if err != nil {
        return nil, err
    }

    return &logical.Response{
        Data: map[string]interface{}{
            "enabled":            config.Enabled,
            "interval_duration":  int(config.Interval / time.Second),
            "tidy_cert_store":    config.CertStore,
            "tidy_revoked_certs": config.RevokedCerts,
            "tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
            "safety_buffer": int(config.SafetyBuffer / time.Second),
        },
    }, nil
}

func (b *backend) pathConfigAutoTidyWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
    sc := b.makeStorageContext(ctx, req.Storage)
    config, err := sc.getAutoTidyConfig()
    if err != nil {
        return nil, err
    }

    if enabledRaw, ok := d.GetOk("enabled"); ok {
        config.Enabled = enabledRaw.(bool)
    }

    if intervalRaw, ok := d.GetOk("interval_duration"); ok {
        config.Interval = time.Duration(intervalRaw.(int)) * time.Second
        if config.Interval < 0 {
            return logical.ErrorResponse(fmt.Sprintf("given interval_duration must be greater than or equal to zero seconds; got: %v", intervalRaw)), nil
        }
    }

    if certStoreRaw, ok := d.GetOk("tidy_cert_store"); ok {
        config.CertStore = certStoreRaw.(bool)
    }

    if revokedCertsRaw, ok := d.GetOk("tidy_revoked_certs"); ok {
        config.RevokedCerts = revokedCertsRaw.(bool)
    }

    if issuerAssocRaw, ok := d.GetOk("tidy_revoked_cert_issuer_associations"); ok {
        config.IssuerAssocs = issuerAssocRaw.(bool)
    }

    if safetyBufferRaw, ok := d.GetOk("safety_buffer"); ok {
        config.SafetyBuffer = time.Duration(safetyBufferRaw.(int)) * time.Second
        if config.SafetyBuffer < 1*time.Second {
            return logical.ErrorResponse(fmt.Sprintf("given safety_buffer must be greater than zero seconds; got: %v", safetyBufferRaw)), nil
        }
    }

    if config.Enabled && !(config.CertStore || config.RevokedCerts || config.IssuerAssocs) {
        return logical.ErrorResponse(fmt.Sprintf("Auto-tidy enabled but no tidy operations were requested. Enable at least one tidy operation to be run (tidy_cert_store / tidy_revoked_certs / tidy_revoked_cert_issuer_associations).")), nil
    }

    return nil, sc.writeAutoTidyConfig(config)
}

func (b *backend) tidyStatusStart(config *tidyConfig) {
    b.tidyStatusLock.Lock()
    defer b.tidyStatusLock.Unlock()
@@ -529,3 +618,7 @@ The result includes the following fields:
* 'revoked_cert_deleted_count': The number of revoked certificate entries deleted
* 'missing_issuer_cert_count': The number of revoked certificates which were missing a valid issuer reference
`

const pathConfigAutoTidySyn = ``

const pathConfigAutoTidyDesc = ``
@@ -0,0 +1,157 @@
package pki

import (
    "testing"
    "time"

    "github.com/hashicorp/vault/api"
    vaulthttp "github.com/hashicorp/vault/http"
    "github.com/hashicorp/vault/sdk/logical"
    "github.com/hashicorp/vault/vault"

    "github.com/stretchr/testify/require"
)

func TestAutoTidy(t *testing.T) {
    t.Parallel()

    // While we'd like to reduce this duration, we need to wait until
    // the rollback manager timer ticks. With the new helper, we can
    // modify the rollback manager timer period directly, allowing us
    // to shorten the total test time significantly.
    //
    // We set the delta CRL time to ensure it executes prior to the
    // main CRL rebuild, and the new CRL doesn't rebuild until after
    // we're done.
    newPeriod := 1 * time.Second

    // This test requires the periodicFunc to trigger, which requires we stand
    // up a full test cluster.
    coreConfig := &vault.CoreConfig{
        LogicalBackends: map[string]logical.Factory{
            "pki": Factory,
        },
        // See notes below about usage of /sys/raw for reading cluster
        // storage without barrier encryption.
        EnableRaw: true,
    }
    cluster := vault.CreateTestClusterWithRollbackPeriod(t, newPeriod, coreConfig, &vault.TestClusterOptions{
        HandlerFunc: vaulthttp.Handler,
    })
    defer cluster.Cleanup()
    client := cluster.Cores[0].Client

    // Mount PKI
    err := client.Sys().Mount("pki", &api.MountInput{
        Type: "pki",
        Config: api.MountConfigInput{
            DefaultLeaseTTL: "10m",
            MaxLeaseTTL:     "60m",
        },
    })
    require.NoError(t, err)

    // Generate root.
    resp, err := client.Logical().Write("pki/root/generate/internal", map[string]interface{}{
        "ttl":         "40h",
        "common_name": "Root X1",
        "key_type":    "ec",
    })
    require.NoError(t, err)
    require.NotNil(t, resp)
    require.NotEmpty(t, resp.Data)
    require.NotEmpty(t, resp.Data["issuer_id"])

    // Run tidy so status is not empty when we run it later...
    _, err = client.Logical().Write("pki/tidy", map[string]interface{}{
        "tidy_revoked_certs": true,
    })
    require.NoError(t, err)

    // Setup a testing role.
    _, err = client.Logical().Write("pki/roles/local-testing", map[string]interface{}{
        "allow_any_name":    true,
        "enforce_hostnames": false,
        "key_type":          "ec",
    })
    require.NoError(t, err)

    // Write the auto-tidy config.
    _, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
        "enabled":            true,
        "interval_duration":  "1s",
        "tidy_cert_store":    true,
        "tidy_revoked_certs": true,
        "safety_buffer":      "1s",
    })
    require.NoError(t, err)

    // Issue a cert and revoke it.
    resp, err = client.Logical().Write("pki/issue/local-testing", map[string]interface{}{
        "common_name": "example.com",
        "ttl":         "10s",
    })
    require.NoError(t, err)
    require.NotNil(t, resp)
    require.NotNil(t, resp.Data)
    require.NotEmpty(t, resp.Data["serial_number"])
    require.NotEmpty(t, resp.Data["certificate"])
    leafSerial := resp.Data["serial_number"].(string)
    leafCert := parseCert(t, resp.Data["certificate"].(string))

    _, err = client.Logical().Write("pki/revoke", map[string]interface{}{
        "serial_number": leafSerial,
    })
    require.NoError(t, err)

    // Cert should still exist.
    resp, err = client.Logical().Read("pki/cert/" + leafSerial)
    require.NoError(t, err)
    require.NotNil(t, resp)
    require.NotNil(t, resp.Data)
    require.NotEmpty(t, resp.Data["certificate"])

    // Wait for cert to expire and the safety buffer to elapse.
    time.Sleep(leafCert.NotAfter.Sub(time.Now()) + 3*time.Second)

    // Wait for auto-tidy to run afterwards.
    var foundTidyRunning string
    var foundTidyFinished bool
    timeoutChan := time.After(120 * time.Second)
    for {
        if foundTidyRunning != "" && foundTidyFinished {
            break
        }

        select {
        case <-timeoutChan:
            t.Fatalf("expected auto-tidy to run (%v) and finish (%v) before 120 seconds elapsed", foundTidyRunning, foundTidyFinished)
        default:
            time.Sleep(250 * time.Millisecond)

            resp, err = client.Logical().Read("pki/tidy-status")
            require.NoError(t, err)
            require.NotNil(t, resp)
            require.NotNil(t, resp.Data)
            require.NotEmpty(t, resp.Data["state"])
            require.NotEmpty(t, resp.Data["time_started"])
            state := resp.Data["state"].(string)
            started := resp.Data["time_started"].(string)
            t.Logf("Resp: %v", resp.Data)

            // We want the _next_ tidy run after the cert expires. This
            // means if we're currently finished when we hit this the
            // first time, we want to wait for the next run.
            if foundTidyRunning == "" {
                foundTidyRunning = started
            } else if foundTidyRunning != started && !foundTidyFinished && state == "Finished" {
                foundTidyFinished = true
            }
        }
    }

    // Cert should no longer exist.
    resp, err = client.Logical().Read("pki/cert/" + leafSerial)
    require.Nil(t, err)
    require.Nil(t, resp)
}
@@ -28,6 +28,8 @@ const (
    deltaCRLPath       = "delta-crl"
    deltaCRLPathSuffix = "-delta"

    autoTidyConfigPath = "config/auto-tidy"

    // Used as a quick sanity check for a reference id lookups...
    uuidLength = 36
@@ -1158,3 +1160,31 @@ func (sc *storageContext) getRevocationConfig() (*crlConfig, error) {
    return &result, nil
}

func (sc *storageContext) getAutoTidyConfig() (*tidyConfig, error) {
    entry, err := sc.Storage.Get(sc.Context, autoTidyConfigPath)
    if err != nil {
        return nil, err
    }

    var result tidyConfig
    if entry == nil {
        result = defaultTidyConfig
        return &result, nil
    }

    if err = entry.DecodeJSON(&result); err != nil {
        return nil, err
    }

    return &result, nil
}

func (sc *storageContext) writeAutoTidyConfig(config *tidyConfig) error {
    entry, err := logical.StorageEntryJSON(autoTidyConfigPath, config)
    if err != nil {
        return err
    }

    return sc.Storage.Put(sc.Context, entry)
}
@@ -0,0 +1,3 @@
```release-note:improvement
secrets/pki: Add ability to periodically run tidy operations to remove expired certificates.
```
@@ -17,7 +17,7 @@ import (
// rollbackPeriod is how often we attempt rollbacks for all the backends.
//
// This is turned into a variable to allow test to check behavior without
// waiting the full minute. See SetRollbackPeriodForTesting(...).
// waiting the full minute. See CreateTestClusterWithRollbackPeriod(...).
var rollbackPeriod = time.Minute

// RollbackManager is responsible for performing rollbacks of partial
@@ -2398,13 +2398,29 @@ func RetryUntil(t testing.T, timeout time.Duration, f func() error) {
    t.Fatalf("did not complete before deadline, err: %v", err)
}

// SetRollbackPeriodForTesting lets us modify the periodic func invocation
// time period to some other value. Best practice is to set this, spin up
// a test cluster and immediately reset the value back to the default, to
// avoid impacting other tests too much. To that end, we return the original
// value of that period.
func SetRollbackPeriodForTesting(newPeriod time.Duration) time.Duration {
// CreateTestClusterWithRollbackPeriod lets us modify the periodic func
// invocation time period to some other value.
//
// Because multiple tests in the PKI mount use this helper, we've added
// a lock around it and created the cluster immediately in this helper.
// This ensures the tests don't race against each other.
var rollbackPeriodLock sync.Mutex

func CreateTestClusterWithRollbackPeriod(t testing.T, newPeriod time.Duration, base *CoreConfig, opts *TestClusterOptions) *TestCluster {
    rollbackPeriodLock.Lock()
    defer rollbackPeriodLock.Unlock()

    // Set the period
    oldPeriod := rollbackPeriod

    // Create and start a new cluster.
    rollbackPeriod = newPeriod
    return oldPeriod
    cluster := NewTestCluster(t, base, opts)
    cluster.Start()

    // Reset the period
    rollbackPeriod = oldPeriod

    // Return the cluster.
    return cluster
}
@@ -68,6 +68,7 @@ update your API calls accordingly.
- [Set CRL Configuration](#set-crl-configuration)
- [Rotate CRLs](#rotate-crls)
- [Tidy](#tidy)
- [Configure Automatic Tidy](#configure-automatic-tidy)
- [Tidy Status](#tidy-status)
- [Cluster Scalability](#cluster-scalability)
- [Managed Key](#managed-keys) (Enterprise Only)
@@ -3162,6 +3163,9 @@ expiration time.
| :----- | :---------- |
| `POST` | `/pki/tidy` |

~> Note: it is encouraged to use the [automatic tidy capabilities](#configure-automatic-tidy)
to ensure this gets run periodically.

#### Parameters

- `tidy_cert_store` `(bool: false)` - Specifies whether to tidy up the certificate
@@ -3204,6 +3208,66 @@ $ curl \
    http://127.0.0.1:8200/v1/pki/tidy
```

### Configure Automatic Tidy

This endpoint allows configuring periodic tidy operations, using the tidy mechanism
described above. Status from automatically run tidies is still reported at the
tidy status endpoint described below.

| Method | Path                    |
| :----- | :---------------------- |
| `POST` | `/pki/config/auto-tidy` |

#### Parameters

- `enabled` `(bool: false)` - Specifies whether automatic tidy is enabled or not.

- `interval_duration` `(string: "")` - Specifies the duration between automatic tidy
  operations; note that this is from the end of one operation to the start of
  the next, so the time of the operation itself does not need to be considered.

- `tidy_cert_store` `(bool: false)` - Specifies whether to tidy up the certificate
  store.

- `tidy_revoked_certs` `(bool: false)` - Set to true to remove all invalid and
  expired certificates from storage. A revoked storage entry is considered
  invalid if the entry is empty, or the value within the entry is empty. If a
  certificate is removed due to expiry, the entry will also be removed from the
  CRL, and the CRL will be rotated.

- `tidy_revoked_cert_issuer_associations` `(bool: false)` - Set to true to associate
  revoked certificates with their corresponding issuers; this improves the
  performance of OCSP and CRL building, by shifting work to a tidy operation
  instead.

- `safety_buffer` `(string: "")` - Specifies a duration using [duration format strings](/docs/concepts/duration-format)
  used as a safety buffer to ensure certificates are not expunged prematurely; as an example, this can keep
  certificates from being removed from the CRL that, due to clock skew, might
  still be considered valid on other hosts. For a certificate to be expunged,
  the time must be after the expiration time of the certificate (according to
  the local clock) plus the duration of `safety_buffer`. Defaults to `72h`.

#### Sample Payload

```json
{
  "enabled": true,
  "tidy_revoked_cert_issuer_associations": true,
  "safety_buffer": "24h"
}
```

#### Sample Request

```shell-session
$ curl \
    --header "X-Vault-Token: ..." \
    --request POST \
    --data @payload.json \
    http://127.0.0.1:8200/v1/pki/config/auto-tidy
```
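In addition to `POST`, the path registers a read operation (see `pathConfigAutoTidyRead` in the diff above) that returns the stored configuration, with both durations reported in seconds. A minimal sketch of such a request; the token and address are placeholders:

```shell-session
$ curl \
    --header "X-Vault-Token: ..." \
    http://127.0.0.1:8200/v1/pki/config/auto-tidy
```

The response data contains `enabled`, `interval_duration`, `tidy_cert_store`, `tidy_revoked_certs`, `tidy_revoked_cert_issuer_associations`, and `safety_buffer`, mirroring the fields written above.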
### Tidy Status

This is a read only endpoint that returns information about the current tidy