Add automatic tidy of expired issuers (#17823)

* Add automatic tidy of expired issuers

To aid PKI users like Consul, which periodically rotate intermediates,
and provided a little more consistency with older versions of Vault
which would silently (and dangerously!) replace the configured CA on
root/intermediate generation, we introduce an automatic tidy of expired
issuers.

This includes a longer safety buffer (1 year) and logging of the
relevant issuer information prior to deletion (certificate contents, key
ID, and issuer ID/name) to allow admins to recover this value if
desired, or perform further cleanup of keys.

From my PoV, removal of the issuer is thus a relatively safe operation
compared to keys (which I do not feel comfortable removing) as they can
always be re-imported if desired. Additionally, this is an opt-in tidy
operation, not enabled by default. Lastly, most major performance
penalties comes with lots of issuers within the mount, not as much
large numbers of keys (as only new issuer creation/import operations are
affected, unlike LIST /issuers which is a public, unauthenticated
endpoint).

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add changelog entry

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add test for tidy

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add docs on tidy of issuers

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Restructure logging

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add missing fields to expected tidy output

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
This commit is contained in:
Alexander Scheel 2022-11-10 10:53:26 -05:00 committed by GitHub
parent d04b91ed0a
commit 5a2ee4ca7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 315 additions and 33 deletions

View File

@ -260,9 +260,11 @@ const (
type tidyStatus struct {
// Parameters used to initiate the operation
safetyBuffer int
issuerSafetyBuffer int
tidyCertStore bool
tidyRevokedCerts bool
tidyRevokedAssocs bool
tidyExpiredIssuers bool
pauseDuration string
// Status

View File

@ -3936,9 +3936,11 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
}
expectedData := map[string]interface{}{
"safety_buffer": json.Number("1"),
"issuer_safety_buffer": json.Number("31536000"),
"tidy_cert_store": true,
"tidy_revoked_certs": true,
"tidy_revoked_cert_issuer_associations": false,
"tidy_expired_issuers": false,
"pause_duration": "0s",
"state": "Finished",
"error": nil,

View File

@ -454,6 +454,13 @@ on revocation entries. This helps increase the performance of CRL building
and OCSP responses.`,
}
fields["tidy_expired_issuers"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `Set to true to automatically remove expired issuers
past the issuer_safety_buffer. No keys will be removed as part of this
operation.`,
}
fields["safety_buffer"] = &framework.FieldSchema{
Type: framework.TypeDurationSecond,
Description: `The amount of extra time that must have passed
@ -463,6 +470,15 @@ Defaults to 72 hours.`,
Default: int(defaultTidyConfig.SafetyBuffer / time.Second), // TypeDurationSecond currently requires defaults to be int
}
fields["issuer_safety_buffer"] = &framework.FieldSchema{
Type: framework.TypeDurationSecond,
Description: `The amount of extra time that must have passed
beyond issuer's expiration before it is removed
from the backend storage.
Defaults to 8760 hours (1 year).`,
Default: int(defaultTidyConfig.IssuerSafetyBuffer / time.Second), // TypeDurationSecond currently requires defaults to be int
}
fields["pause_duration"] = &framework.FieldSchema{
Type: framework.TypeString,
Description: `The amount of time to wait between processing

View File

@ -13,6 +13,7 @@ import (
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/logical"
)
@ -24,7 +25,9 @@ type tidyConfig struct {
CertStore bool `json:"tidy_cert_store"`
RevokedCerts bool `json:"tidy_revoked_certs"`
IssuerAssocs bool `json:"tidy_revoked_cert_issuer_associations"`
ExpiredIssuers bool `json:"tidy_expired_issuers"`
SafetyBuffer time.Duration `json:"safety_buffer"`
IssuerSafetyBuffer time.Duration `json:"issuer_safety_buffer"`
PauseDuration time.Duration `json:"pause_duration"`
}
@ -34,7 +37,9 @@ var defaultTidyConfig = tidyConfig{
CertStore: false,
RevokedCerts: false,
IssuerAssocs: false,
ExpiredIssuers: false,
SafetyBuffer: 72 * time.Hour,
IssuerSafetyBuffer: 365 * 24 * time.Hour,
PauseDuration: 0 * time.Second,
}
@ -116,6 +121,8 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
tidyCertStore := d.Get("tidy_cert_store").(bool)
tidyRevokedCerts := d.Get("tidy_revoked_certs").(bool) || d.Get("tidy_revocation_list").(bool)
tidyRevokedAssocs := d.Get("tidy_revoked_cert_issuer_associations").(bool)
tidyExpiredIssuers := d.Get("tidy_expired_issuers").(bool)
issuerSafetyBuffer := d.Get("issuer_safety_buffer").(int)
pauseDurationStr := d.Get("pause_duration").(string)
pauseDuration := 0 * time.Second
@ -123,6 +130,10 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
return logical.ErrorResponse("safety_buffer must be greater than zero"), nil
}
if issuerSafetyBuffer < 1 {
return logical.ErrorResponse("issuer_safety_buffer must be greater than zero"), nil
}
if pauseDurationStr != "" {
var err error
pauseDuration, err = time.ParseDuration(pauseDurationStr)
@ -136,6 +147,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
}
bufferDuration := time.Duration(safetyBuffer) * time.Second
issuerBufferDuration := time.Duration(issuerSafetyBuffer) * time.Second
// Manual run with constructed configuration.
config := &tidyConfig{
@ -144,7 +156,9 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
CertStore: tidyCertStore,
RevokedCerts: tidyRevokedCerts,
IssuerAssocs: tidyRevokedAssocs,
ExpiredIssuers: tidyExpiredIssuers,
SafetyBuffer: bufferDuration,
IssuerSafetyBuffer: issuerBufferDuration,
PauseDuration: pauseDuration,
}
@ -170,8 +184,8 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
b.startTidyOperation(req, config)
resp := &logical.Response{}
if !tidyCertStore && !tidyRevokedCerts && !tidyRevokedAssocs {
resp.AddWarning("No targets to tidy; specify tidy_cert_store=true or tidy_revoked_certs=true or tidy_revoked_cert_issuer_associations=true to start a tidy operation.")
if !tidyCertStore && !tidyRevokedCerts && !tidyRevokedAssocs && !tidyExpiredIssuers {
resp.AddWarning("No targets to tidy; specify tidy_cert_store=true or tidy_revoked_certs=true or tidy_revoked_cert_issuer_associations=true or tidy_expired_issuers=true to start a tidy operation.")
} else {
resp.AddWarning("Tidy operation successfully started. Any information from the operation will be printed to Vault's server logs.")
}
@ -209,6 +223,12 @@ func (b *backend) startTidyOperation(req *logical.Request, config *tidyConfig) {
}
}
if config.ExpiredIssuers {
if err := b.doTidyExpiredIssuers(ctx, req, logger, config); err != nil {
return err
}
}
return nil
}
@ -440,6 +460,111 @@ func (b *backend) doTidyRevocationStore(ctx context.Context, req *logical.Reques
return nil
}
func (b *backend) doTidyExpiredIssuers(ctx context.Context, req *logical.Request, logger hclog.Logger, config *tidyConfig) error {
if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary|consts.ReplicationPerformanceStandby) ||
(!b.System().LocalMount() && b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary)) {
b.Logger().Debug("skipping expired issuer tidy as we're not on the primary or secondary with a local mount")
return nil
}
// Short-circuit to avoid having to deal with the legacy mounts. While we
// could handle this case and remove these issuers, its somewhat
// unexpected behavior and we'd prefer to finish the migration first.
if b.useLegacyBundleCaStorage() {
return nil
}
b.issuersLock.Lock()
defer b.issuersLock.Unlock()
// Fetch and parse our issuers so we have their expiration date.
sc := b.makeStorageContext(ctx, req.Storage)
issuerIDCertMap, err := fetchIssuerMapForRevocationChecking(sc)
if err != nil {
return err
}
// Fetch the issuer config to find the default; we don't want to remove
// the current active issuer automatically.
iConfig, err := sc.getIssuersConfig()
if err != nil {
return err
}
// We want certificates which have expired before this date by a given
// safety buffer. So we subtract the buffer from now, and anything which
// has expired before our after buffer can be tidied, and anything that
// expired after this buffer must be kept.
now := time.Now()
afterBuffer := now.Add(-1 * config.IssuerSafetyBuffer)
rebuildChainsAndCRL := false
for issuer, cert := range issuerIDCertMap {
if cert.NotAfter.After(afterBuffer) {
continue
}
entry, err := sc.fetchIssuerById(issuer)
if err != nil {
return nil
}
// This issuer's certificate has expired. We explicitly persist the
// key, but log both the certificate and the keyId to the
// informational logs so an admin can recover the removed cert if
// necessary or remove the key (and know which cert it belonged to),
// if desired.
msg := "[Tidy on mount: %v] Issuer %v has expired by %v and is being removed."
idAndName := fmt.Sprintf("[id:%v/name:%v]", entry.ID, entry.Name)
msg = fmt.Sprintf(msg, b.backendUUID, idAndName, config.IssuerSafetyBuffer)
// Before we log, check if we're the default. While this is late, and
// after we read it from storage, we have more info here to tell the
// user that their default has expired AND has passed the safety
// buffer.
if iConfig.DefaultIssuerId == issuer {
msg = "[Tidy on mount: %v] Issuer %v has expired and would be removed via tidy, but won't be, as it is currently the default issuer."
msg = fmt.Sprintf(msg, b.backendUUID, idAndName)
b.Logger().Warn(msg)
continue
}
// Log the above message..
b.Logger().Info(msg, "serial_number", entry.SerialNumber, "key_id", entry.KeyID, "certificate", entry.Certificate)
wasDefault, err := sc.deleteIssuer(issuer)
if err != nil {
b.Logger().Error(fmt.Sprintf("failed to remove %v: %v", idAndName, err))
return err
}
if wasDefault {
b.Logger().Warn(fmt.Sprintf("expired issuer %v was default; it is strongly encouraged to choose a new default issuer for backwards compatibility", idAndName))
}
rebuildChainsAndCRL = true
}
if rebuildChainsAndCRL {
// When issuers are removed, there's a chance chains change as a
// result; remove them.
if err := sc.rebuildIssuersChains(nil); err != nil {
return err
}
// Removal of issuers is generally a good reason to rebuild the CRL,
// even if auto-rebuild is enabled.
b.revokeStorageLock.Lock()
defer b.revokeStorageLock.Unlock()
if err := b.crlBuilder.rebuild(ctx, b, req, false); err != nil {
return err
}
}
return nil
}
func (b *backend) pathTidyCancelWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
if atomic.LoadUint32(b.tidyCASGuard) == 0 {
resp := &logical.Response{}
@ -470,9 +595,11 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
resp := &logical.Response{
Data: map[string]interface{}{
"safety_buffer": nil,
"issuer_safety_buffer": nil,
"tidy_cert_store": nil,
"tidy_revoked_certs": nil,
"tidy_revoked_cert_issuer_associations": nil,
"tidy_expired_issuers": nil,
"pause_duration": nil,
"state": "Inactive",
"error": nil,
@ -492,9 +619,11 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
}
resp.Data["safety_buffer"] = b.tidyStatus.safetyBuffer
resp.Data["issuer_safety_buffer"] = b.tidyStatus.issuerSafetyBuffer
resp.Data["tidy_cert_store"] = b.tidyStatus.tidyCertStore
resp.Data["tidy_revoked_certs"] = b.tidyStatus.tidyRevokedCerts
resp.Data["tidy_revoked_cert_issuer_associations"] = b.tidyStatus.tidyRevokedAssocs
resp.Data["tidy_expired_issuers"] = b.tidyStatus.tidyExpiredIssuers
resp.Data["pause_duration"] = b.tidyStatus.pauseDuration
resp.Data["time_started"] = b.tidyStatus.timeStarted
resp.Data["message"] = b.tidyStatus.message
@ -547,7 +676,9 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque
"tidy_cert_store": config.CertStore,
"tidy_revoked_certs": config.RevokedCerts,
"tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
"tidy_expired_issuers": config.ExpiredIssuers,
"safety_buffer": int(config.SafetyBuffer / time.Second),
"issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second),
"pause_duration": config.PauseDuration.String(),
},
}, nil
@ -614,9 +745,11 @@ func (b *backend) tidyStatusStart(config *tidyConfig) {
b.tidyStatus = &tidyStatus{
safetyBuffer: int(config.SafetyBuffer / time.Second),
issuerSafetyBuffer: int(config.IssuerSafetyBuffer / time.Second),
tidyCertStore: config.CertStore,
tidyRevokedCerts: config.RevokedCerts,
tidyRevokedAssocs: config.IssuerAssocs,
tidyExpiredIssuers: config.ExpiredIssuers,
pauseDuration: config.PauseDuration.String(),
state: tidyStatusStarted,

View File

@ -264,3 +264,110 @@ func TestTidyCancellation(t *testing.T) {
t.Fatalf("expected to only process at most 3 more certificates, but processed (%v >>> %v) certs", nowMany, howMany)
}
}
func TestTidyIssuers(t *testing.T) {
t.Parallel()
b, s := createBackendWithStorage(t)
// Create a root that expires quickly and one valid for longer.
_, err := CBWrite(b, s, "root/generate/internal", map[string]interface{}{
"common_name": "root1 example.com",
"issuer_name": "root-expired",
"ttl": "1s",
"key_type": "ec",
})
require.NoError(t, err)
_, err = CBWrite(b, s, "root/generate/internal", map[string]interface{}{
"common_name": "root2 example.com",
"issuer_name": "root-valid",
"ttl": "60m",
"key_type": "rsa",
})
require.NoError(t, err)
// Sleep long enough to expire the root.
time.Sleep(2 * time.Second)
// First tidy run shouldn't remove anything; too long of safety buffer.
_, err = CBWrite(b, s, "tidy", map[string]interface{}{
"tidy_expired_issuers": true,
"issuer_safety_buffer": "60m",
})
require.NoError(t, err)
// Wait for tidy to finish.
time.Sleep(2 * time.Second)
// Expired issuer should exist.
resp, err := CBRead(b, s, "issuer/root-expired")
requireSuccessNonNilResponse(t, resp, err, "expired should still be present")
resp, err = CBRead(b, s, "issuer/root-valid")
requireSuccessNonNilResponse(t, resp, err, "valid should still be present")
// Second tidy run with shorter safety buffer shouldn't remove the
// expired one, as it should be the default issuer.
_, err = CBWrite(b, s, "tidy", map[string]interface{}{
"tidy_expired_issuers": true,
"issuer_safety_buffer": "1s",
})
require.NoError(t, err)
// Wait for tidy to finish.
time.Sleep(2 * time.Second)
// Expired issuer should still exist.
resp, err = CBRead(b, s, "issuer/root-expired")
requireSuccessNonNilResponse(t, resp, err, "expired should still be present")
resp, err = CBRead(b, s, "issuer/root-valid")
requireSuccessNonNilResponse(t, resp, err, "valid should still be present")
// Update the default issuer.
_, err = CBWrite(b, s, "config/issuers", map[string]interface{}{
"default": "root-valid",
})
require.NoError(t, err)
// Third tidy run should remove the expired one.
_, err = CBWrite(b, s, "tidy", map[string]interface{}{
"tidy_expired_issuers": true,
"issuer_safety_buffer": "1s",
})
require.NoError(t, err)
// Wait for tidy to finish.
time.Sleep(2 * time.Second)
// Valid issuer should exist still; other should be removed.
resp, err = CBRead(b, s, "issuer/root-expired")
require.Error(t, err)
require.Nil(t, resp)
resp, err = CBRead(b, s, "issuer/root-valid")
requireSuccessNonNilResponse(t, resp, err, "valid should still be present")
// Finally, one more tidy should cause no changes.
_, err = CBWrite(b, s, "tidy", map[string]interface{}{
"tidy_expired_issuers": true,
"issuer_safety_buffer": "1s",
})
require.NoError(t, err)
// Wait for tidy to finish.
time.Sleep(2 * time.Second)
// Valid issuer should exist still; other should be removed.
resp, err = CBRead(b, s, "issuer/root-expired")
require.Error(t, err)
require.Nil(t, resp)
resp, err = CBRead(b, s, "issuer/root-valid")
requireSuccessNonNilResponse(t, resp, err, "valid should still be present")
// Ensure we have safety buffer and expired issuers set correctly.
statusResp, err := CBRead(b, s, "tidy-status")
require.NoError(t, err)
require.NotNil(t, statusResp)
require.NotNil(t, statusResp.Data)
require.Equal(t, statusResp.Data["issuer_safety_buffer"], 1)
require.Equal(t, statusResp.Data["tidy_expired_issuers"], true)
}

View File

@ -1222,6 +1222,10 @@ func (sc *storageContext) getAutoTidyConfig() (*tidyConfig, error) {
return nil, err
}
if result.IssuerSafetyBuffer == 0 {
result.IssuerSafetyBuffer = defaultTidyConfig.IssuerSafetyBuffer
}
return &result, nil
}

3
changelog/17823.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
secrets/pki: Allow tidying of expired issuer certificates.
```

View File

@ -3325,6 +3325,15 @@ expiration time.
performance of OCSP and CRL building, by shifting work to a tidy operation
instead.
- `tidy_expired_issuers` `(bool: false)` - Set to true to automatically remove
expired issuers after the `issuer_safety_buffer` duration has elapsed. We
log the issuer certificate on removal to allow recovery; no keys are removed
during this process.
~> Note: When the default issuer expires and is tidied, a new default must be
chosen manually by the operator; this process will not select a new one
automatically.
- `safety_buffer` `(string: "")` - Specifies a duration using [duration format strings](/docs/concepts/duration-format)
used as a safety buffer to ensure certificates are not expunged prematurely; as an example, this can keep
certificates from being removed from the CRL that, due to clock skew, might
@ -3332,6 +3341,10 @@ expiration time.
the time must be after the expiration time of the certificate (according to
the local clock) plus the duration of `safety_buffer`. Defaults to `72h`.
- `issuer_safety_buffer` `(string: "")` - Specifies a duration that issuers
should be kept for, past their `NotAfter` validity period. Defaults to
365 days as hours (`8760h`).
- `pause_duration` `(string: "0s")` - Specifies the duration to pause
between tidying individual certificates. This releases the revocation
lock and allows other operations to continue while tidy is running.
@ -3340,6 +3353,8 @@ expiration time.
between reading, parsing, and updates on-disk cert entries will be
increased, decreasing resource utilization.
Does not affect `tidy_expired_issuers`.
~> Note: Using too long of a `pause_duration` can result in tidy operations
not concluding during this lifetime! Using too short of a pause duration
(but non-zero) can lead to lock contention. Use [tidy's cancellation](#cancel-tidy)