From 5a2ee4ca7a297f4858f236b1d3cdb58c215c9dd6 Mon Sep 17 00:00:00 2001 From: Alexander Scheel Date: Thu, 10 Nov 2022 10:53:26 -0500 Subject: [PATCH] Add automatic tidy of expired issuers (#17823) * Add automatic tidy of expired issuers To aid PKI users like Consul, which periodically rotate intermediates, and provided a little more consistency with older versions of Vault which would silently (and dangerously!) replace the configured CA on root/intermediate generation, we introduce an automatic tidy of expired issuers. This includes a longer safety buffer (1 year) and logging of the relevant issuer information prior to deletion (certificate contents, key ID, and issuer ID/name) to allow admins to recover this value if desired, or perform further cleanup of keys. From my PoV, removal of the issuer is thus a relatively safe operation compared to keys (which I do not feel comfortable removing) as they can always be re-imported if desired. Additionally, this is an opt-in tidy operation, not enabled by default. Lastly, most major performance penalties comes with lots of issuers within the mount, not as much large numbers of keys (as only new issuer creation/import operations are affected, unlike LIST /issuers which is a public, unauthenticated endpoint). Signed-off-by: Alexander Scheel * Add changelog entry Signed-off-by: Alexander Scheel * Add test for tidy Signed-off-by: Alexander Scheel * Add docs on tidy of issuers Signed-off-by: Alexander Scheel * Restructure logging Signed-off-by: Alexander Scheel * Add missing fields to expected tidy output Signed-off-by: Alexander Scheel Signed-off-by: Alexander Scheel --- builtin/logical/pki/backend.go | 12 +- builtin/logical/pki/backend_test.go | 2 + builtin/logical/pki/fields.go | 16 ++ builtin/logical/pki/path_tidy.go | 189 ++++++++++++++++++++---- builtin/logical/pki/path_tidy_test.go | 107 ++++++++++++++ builtin/logical/pki/storage.go | 4 + changelog/17823.txt | 3 + website/content/api-docs/secret/pki.mdx | 15 ++ 8 files changed, 315 insertions(+), 33 deletions(-) create mode 100644 changelog/17823.txt diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index ba3f093fd..c92dfa949 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -259,11 +259,13 @@ const ( type tidyStatus struct { // Parameters used to initiate the operation - safetyBuffer int - tidyCertStore bool - tidyRevokedCerts bool - tidyRevokedAssocs bool - pauseDuration string + safetyBuffer int + issuerSafetyBuffer int + tidyCertStore bool + tidyRevokedCerts bool + tidyRevokedAssocs bool + tidyExpiredIssuers bool + pauseDuration string // Status state tidyStatusState diff --git a/builtin/logical/pki/backend_test.go b/builtin/logical/pki/backend_test.go index 282d670a7..9b41a9cf7 100644 --- a/builtin/logical/pki/backend_test.go +++ b/builtin/logical/pki/backend_test.go @@ -3936,9 +3936,11 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { } expectedData := map[string]interface{}{ "safety_buffer": json.Number("1"), + "issuer_safety_buffer": json.Number("31536000"), "tidy_cert_store": true, "tidy_revoked_certs": true, "tidy_revoked_cert_issuer_associations": false, + "tidy_expired_issuers": false, "pause_duration": "0s", "state": "Finished", "error": nil, diff --git a/builtin/logical/pki/fields.go b/builtin/logical/pki/fields.go index 497ea8893..78cdd67e8 100644 --- a/builtin/logical/pki/fields.go +++ b/builtin/logical/pki/fields.go @@ -454,6 +454,13 @@ on revocation entries. This helps increase the performance of CRL building and OCSP responses.`, } + fields["tidy_expired_issuers"] = &framework.FieldSchema{ + Type: framework.TypeBool, + Description: `Set to true to automatically remove expired issuers +past the issuer_safety_buffer. No keys will be removed as part of this +operation.`, + } + fields["safety_buffer"] = &framework.FieldSchema{ Type: framework.TypeDurationSecond, Description: `The amount of extra time that must have passed @@ -463,6 +470,15 @@ Defaults to 72 hours.`, Default: int(defaultTidyConfig.SafetyBuffer / time.Second), // TypeDurationSecond currently requires defaults to be int } + fields["issuer_safety_buffer"] = &framework.FieldSchema{ + Type: framework.TypeDurationSecond, + Description: `The amount of extra time that must have passed +beyond issuer's expiration before it is removed +from the backend storage. +Defaults to 8760 hours (1 year).`, + Default: int(defaultTidyConfig.IssuerSafetyBuffer / time.Second), // TypeDurationSecond currently requires defaults to be int + } + fields["pause_duration"] = &framework.FieldSchema{ Type: framework.TypeString, Description: `The amount of time to wait between processing diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index 82970d3aa..ae5f160ec 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -13,29 +13,34 @@ import ( "github.com/hashicorp/go-hclog" "github.com/hashicorp/vault/sdk/framework" + "github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/logical" ) var tidyCancelledError = errors.New("tidy operation cancelled") type tidyConfig struct { - Enabled bool `json:"enabled"` - Interval time.Duration `json:"interval_duration"` - CertStore bool `json:"tidy_cert_store"` - RevokedCerts bool `json:"tidy_revoked_certs"` - IssuerAssocs bool `json:"tidy_revoked_cert_issuer_associations"` - SafetyBuffer time.Duration `json:"safety_buffer"` - PauseDuration time.Duration `json:"pause_duration"` + Enabled bool `json:"enabled"` + Interval time.Duration `json:"interval_duration"` + CertStore bool `json:"tidy_cert_store"` + RevokedCerts bool `json:"tidy_revoked_certs"` + IssuerAssocs bool `json:"tidy_revoked_cert_issuer_associations"` + ExpiredIssuers bool `json:"tidy_expired_issuers"` + SafetyBuffer time.Duration `json:"safety_buffer"` + IssuerSafetyBuffer time.Duration `json:"issuer_safety_buffer"` + PauseDuration time.Duration `json:"pause_duration"` } var defaultTidyConfig = tidyConfig{ - Enabled: false, - Interval: 12 * time.Hour, - CertStore: false, - RevokedCerts: false, - IssuerAssocs: false, - SafetyBuffer: 72 * time.Hour, - PauseDuration: 0 * time.Second, + Enabled: false, + Interval: 12 * time.Hour, + CertStore: false, + RevokedCerts: false, + IssuerAssocs: false, + ExpiredIssuers: false, + SafetyBuffer: 72 * time.Hour, + IssuerSafetyBuffer: 365 * 24 * time.Hour, + PauseDuration: 0 * time.Second, } func pathTidy(b *backend) *framework.Path { @@ -116,6 +121,8 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr tidyCertStore := d.Get("tidy_cert_store").(bool) tidyRevokedCerts := d.Get("tidy_revoked_certs").(bool) || d.Get("tidy_revocation_list").(bool) tidyRevokedAssocs := d.Get("tidy_revoked_cert_issuer_associations").(bool) + tidyExpiredIssuers := d.Get("tidy_expired_issuers").(bool) + issuerSafetyBuffer := d.Get("issuer_safety_buffer").(int) pauseDurationStr := d.Get("pause_duration").(string) pauseDuration := 0 * time.Second @@ -123,6 +130,10 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr return logical.ErrorResponse("safety_buffer must be greater than zero"), nil } + if issuerSafetyBuffer < 1 { + return logical.ErrorResponse("issuer_safety_buffer must be greater than zero"), nil + } + if pauseDurationStr != "" { var err error pauseDuration, err = time.ParseDuration(pauseDurationStr) @@ -136,16 +147,19 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr } bufferDuration := time.Duration(safetyBuffer) * time.Second + issuerBufferDuration := time.Duration(issuerSafetyBuffer) * time.Second // Manual run with constructed configuration. config := &tidyConfig{ - Enabled: true, - Interval: 0 * time.Second, - CertStore: tidyCertStore, - RevokedCerts: tidyRevokedCerts, - IssuerAssocs: tidyRevokedAssocs, - SafetyBuffer: bufferDuration, - PauseDuration: pauseDuration, + Enabled: true, + Interval: 0 * time.Second, + CertStore: tidyCertStore, + RevokedCerts: tidyRevokedCerts, + IssuerAssocs: tidyRevokedAssocs, + ExpiredIssuers: tidyExpiredIssuers, + SafetyBuffer: bufferDuration, + IssuerSafetyBuffer: issuerBufferDuration, + PauseDuration: pauseDuration, } if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) { @@ -170,8 +184,8 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr b.startTidyOperation(req, config) resp := &logical.Response{} - if !tidyCertStore && !tidyRevokedCerts && !tidyRevokedAssocs { - resp.AddWarning("No targets to tidy; specify tidy_cert_store=true or tidy_revoked_certs=true or tidy_revoked_cert_issuer_associations=true to start a tidy operation.") + if !tidyCertStore && !tidyRevokedCerts && !tidyRevokedAssocs && !tidyExpiredIssuers { + resp.AddWarning("No targets to tidy; specify tidy_cert_store=true or tidy_revoked_certs=true or tidy_revoked_cert_issuer_associations=true or tidy_expired_issuers=true to start a tidy operation.") } else { resp.AddWarning("Tidy operation successfully started. Any information from the operation will be printed to Vault's server logs.") } @@ -209,6 +223,12 @@ func (b *backend) startTidyOperation(req *logical.Request, config *tidyConfig) { } } + if config.ExpiredIssuers { + if err := b.doTidyExpiredIssuers(ctx, req, logger, config); err != nil { + return err + } + } + return nil } @@ -440,6 +460,111 @@ func (b *backend) doTidyRevocationStore(ctx context.Context, req *logical.Reques return nil } +func (b *backend) doTidyExpiredIssuers(ctx context.Context, req *logical.Request, logger hclog.Logger, config *tidyConfig) error { + if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary|consts.ReplicationPerformanceStandby) || + (!b.System().LocalMount() && b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary)) { + b.Logger().Debug("skipping expired issuer tidy as we're not on the primary or secondary with a local mount") + return nil + } + + // Short-circuit to avoid having to deal with the legacy mounts. While we + // could handle this case and remove these issuers, its somewhat + // unexpected behavior and we'd prefer to finish the migration first. + if b.useLegacyBundleCaStorage() { + return nil + } + + b.issuersLock.Lock() + defer b.issuersLock.Unlock() + + // Fetch and parse our issuers so we have their expiration date. + sc := b.makeStorageContext(ctx, req.Storage) + issuerIDCertMap, err := fetchIssuerMapForRevocationChecking(sc) + if err != nil { + return err + } + + // Fetch the issuer config to find the default; we don't want to remove + // the current active issuer automatically. + iConfig, err := sc.getIssuersConfig() + if err != nil { + return err + } + + // We want certificates which have expired before this date by a given + // safety buffer. So we subtract the buffer from now, and anything which + // has expired before our after buffer can be tidied, and anything that + // expired after this buffer must be kept. + now := time.Now() + afterBuffer := now.Add(-1 * config.IssuerSafetyBuffer) + + rebuildChainsAndCRL := false + + for issuer, cert := range issuerIDCertMap { + if cert.NotAfter.After(afterBuffer) { + continue + } + + entry, err := sc.fetchIssuerById(issuer) + if err != nil { + return nil + } + + // This issuer's certificate has expired. We explicitly persist the + // key, but log both the certificate and the keyId to the + // informational logs so an admin can recover the removed cert if + // necessary or remove the key (and know which cert it belonged to), + // if desired. + msg := "[Tidy on mount: %v] Issuer %v has expired by %v and is being removed." + idAndName := fmt.Sprintf("[id:%v/name:%v]", entry.ID, entry.Name) + msg = fmt.Sprintf(msg, b.backendUUID, idAndName, config.IssuerSafetyBuffer) + + // Before we log, check if we're the default. While this is late, and + // after we read it from storage, we have more info here to tell the + // user that their default has expired AND has passed the safety + // buffer. + if iConfig.DefaultIssuerId == issuer { + msg = "[Tidy on mount: %v] Issuer %v has expired and would be removed via tidy, but won't be, as it is currently the default issuer." + msg = fmt.Sprintf(msg, b.backendUUID, idAndName) + b.Logger().Warn(msg) + continue + } + + // Log the above message.. + b.Logger().Info(msg, "serial_number", entry.SerialNumber, "key_id", entry.KeyID, "certificate", entry.Certificate) + + wasDefault, err := sc.deleteIssuer(issuer) + if err != nil { + b.Logger().Error(fmt.Sprintf("failed to remove %v: %v", idAndName, err)) + return err + } + if wasDefault { + b.Logger().Warn(fmt.Sprintf("expired issuer %v was default; it is strongly encouraged to choose a new default issuer for backwards compatibility", idAndName)) + } + + rebuildChainsAndCRL = true + } + + if rebuildChainsAndCRL { + // When issuers are removed, there's a chance chains change as a + // result; remove them. + if err := sc.rebuildIssuersChains(nil); err != nil { + return err + } + + // Removal of issuers is generally a good reason to rebuild the CRL, + // even if auto-rebuild is enabled. + b.revokeStorageLock.Lock() + defer b.revokeStorageLock.Unlock() + + if err := b.crlBuilder.rebuild(ctx, b, req, false); err != nil { + return err + } + } + + return nil +} + func (b *backend) pathTidyCancelWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) { if atomic.LoadUint32(b.tidyCASGuard) == 0 { resp := &logical.Response{} @@ -470,9 +595,11 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp := &logical.Response{ Data: map[string]interface{}{ "safety_buffer": nil, + "issuer_safety_buffer": nil, "tidy_cert_store": nil, "tidy_revoked_certs": nil, "tidy_revoked_cert_issuer_associations": nil, + "tidy_expired_issuers": nil, "pause_duration": nil, "state": "Inactive", "error": nil, @@ -492,9 +619,11 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f } resp.Data["safety_buffer"] = b.tidyStatus.safetyBuffer + resp.Data["issuer_safety_buffer"] = b.tidyStatus.issuerSafetyBuffer resp.Data["tidy_cert_store"] = b.tidyStatus.tidyCertStore resp.Data["tidy_revoked_certs"] = b.tidyStatus.tidyRevokedCerts resp.Data["tidy_revoked_cert_issuer_associations"] = b.tidyStatus.tidyRevokedAssocs + resp.Data["tidy_expired_issuers"] = b.tidyStatus.tidyExpiredIssuers resp.Data["pause_duration"] = b.tidyStatus.pauseDuration resp.Data["time_started"] = b.tidyStatus.timeStarted resp.Data["message"] = b.tidyStatus.message @@ -547,7 +676,9 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque "tidy_cert_store": config.CertStore, "tidy_revoked_certs": config.RevokedCerts, "tidy_revoked_cert_issuer_associations": config.IssuerAssocs, + "tidy_expired_issuers": config.ExpiredIssuers, "safety_buffer": int(config.SafetyBuffer / time.Second), + "issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second), "pause_duration": config.PauseDuration.String(), }, }, nil @@ -613,11 +744,13 @@ func (b *backend) tidyStatusStart(config *tidyConfig) { defer b.tidyStatusLock.Unlock() b.tidyStatus = &tidyStatus{ - safetyBuffer: int(config.SafetyBuffer / time.Second), - tidyCertStore: config.CertStore, - tidyRevokedCerts: config.RevokedCerts, - tidyRevokedAssocs: config.IssuerAssocs, - pauseDuration: config.PauseDuration.String(), + safetyBuffer: int(config.SafetyBuffer / time.Second), + issuerSafetyBuffer: int(config.IssuerSafetyBuffer / time.Second), + tidyCertStore: config.CertStore, + tidyRevokedCerts: config.RevokedCerts, + tidyRevokedAssocs: config.IssuerAssocs, + tidyExpiredIssuers: config.ExpiredIssuers, + pauseDuration: config.PauseDuration.String(), state: tidyStatusStarted, timeStarted: time.Now(), diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index be950f865..f00def8cc 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -264,3 +264,110 @@ func TestTidyCancellation(t *testing.T) { t.Fatalf("expected to only process at most 3 more certificates, but processed (%v >>> %v) certs", nowMany, howMany) } } + +func TestTidyIssuers(t *testing.T) { + t.Parallel() + + b, s := createBackendWithStorage(t) + + // Create a root that expires quickly and one valid for longer. + _, err := CBWrite(b, s, "root/generate/internal", map[string]interface{}{ + "common_name": "root1 example.com", + "issuer_name": "root-expired", + "ttl": "1s", + "key_type": "ec", + }) + require.NoError(t, err) + + _, err = CBWrite(b, s, "root/generate/internal", map[string]interface{}{ + "common_name": "root2 example.com", + "issuer_name": "root-valid", + "ttl": "60m", + "key_type": "rsa", + }) + require.NoError(t, err) + + // Sleep long enough to expire the root. + time.Sleep(2 * time.Second) + + // First tidy run shouldn't remove anything; too long of safety buffer. + _, err = CBWrite(b, s, "tidy", map[string]interface{}{ + "tidy_expired_issuers": true, + "issuer_safety_buffer": "60m", + }) + require.NoError(t, err) + + // Wait for tidy to finish. + time.Sleep(2 * time.Second) + + // Expired issuer should exist. + resp, err := CBRead(b, s, "issuer/root-expired") + requireSuccessNonNilResponse(t, resp, err, "expired should still be present") + resp, err = CBRead(b, s, "issuer/root-valid") + requireSuccessNonNilResponse(t, resp, err, "valid should still be present") + + // Second tidy run with shorter safety buffer shouldn't remove the + // expired one, as it should be the default issuer. + _, err = CBWrite(b, s, "tidy", map[string]interface{}{ + "tidy_expired_issuers": true, + "issuer_safety_buffer": "1s", + }) + require.NoError(t, err) + + // Wait for tidy to finish. + time.Sleep(2 * time.Second) + + // Expired issuer should still exist. + resp, err = CBRead(b, s, "issuer/root-expired") + requireSuccessNonNilResponse(t, resp, err, "expired should still be present") + resp, err = CBRead(b, s, "issuer/root-valid") + requireSuccessNonNilResponse(t, resp, err, "valid should still be present") + + // Update the default issuer. + _, err = CBWrite(b, s, "config/issuers", map[string]interface{}{ + "default": "root-valid", + }) + require.NoError(t, err) + + // Third tidy run should remove the expired one. + _, err = CBWrite(b, s, "tidy", map[string]interface{}{ + "tidy_expired_issuers": true, + "issuer_safety_buffer": "1s", + }) + require.NoError(t, err) + + // Wait for tidy to finish. + time.Sleep(2 * time.Second) + + // Valid issuer should exist still; other should be removed. + resp, err = CBRead(b, s, "issuer/root-expired") + require.Error(t, err) + require.Nil(t, resp) + resp, err = CBRead(b, s, "issuer/root-valid") + requireSuccessNonNilResponse(t, resp, err, "valid should still be present") + + // Finally, one more tidy should cause no changes. + _, err = CBWrite(b, s, "tidy", map[string]interface{}{ + "tidy_expired_issuers": true, + "issuer_safety_buffer": "1s", + }) + require.NoError(t, err) + + // Wait for tidy to finish. + time.Sleep(2 * time.Second) + + // Valid issuer should exist still; other should be removed. + resp, err = CBRead(b, s, "issuer/root-expired") + require.Error(t, err) + require.Nil(t, resp) + resp, err = CBRead(b, s, "issuer/root-valid") + requireSuccessNonNilResponse(t, resp, err, "valid should still be present") + + // Ensure we have safety buffer and expired issuers set correctly. + statusResp, err := CBRead(b, s, "tidy-status") + require.NoError(t, err) + require.NotNil(t, statusResp) + require.NotNil(t, statusResp.Data) + require.Equal(t, statusResp.Data["issuer_safety_buffer"], 1) + require.Equal(t, statusResp.Data["tidy_expired_issuers"], true) +} diff --git a/builtin/logical/pki/storage.go b/builtin/logical/pki/storage.go index fc1c83e32..e55ea4121 100644 --- a/builtin/logical/pki/storage.go +++ b/builtin/logical/pki/storage.go @@ -1222,6 +1222,10 @@ func (sc *storageContext) getAutoTidyConfig() (*tidyConfig, error) { return nil, err } + if result.IssuerSafetyBuffer == 0 { + result.IssuerSafetyBuffer = defaultTidyConfig.IssuerSafetyBuffer + } + return &result, nil } diff --git a/changelog/17823.txt b/changelog/17823.txt new file mode 100644 index 000000000..d999c8794 --- /dev/null +++ b/changelog/17823.txt @@ -0,0 +1,3 @@ +```release-note:improvement +secrets/pki: Allow tidying of expired issuer certificates. +``` diff --git a/website/content/api-docs/secret/pki.mdx b/website/content/api-docs/secret/pki.mdx index e937bbd08..a8851c73c 100644 --- a/website/content/api-docs/secret/pki.mdx +++ b/website/content/api-docs/secret/pki.mdx @@ -3325,6 +3325,15 @@ expiration time. performance of OCSP and CRL building, by shifting work to a tidy operation instead. +- `tidy_expired_issuers` `(bool: false)` - Set to true to automatically remove + expired issuers after the `issuer_safety_buffer` duration has elapsed. We + log the issuer certificate on removal to allow recovery; no keys are removed + during this process. + +~> Note: When the default issuer expires and is tidied, a new default must be + chosen manually by the operator; this process will not select a new one + automatically. + - `safety_buffer` `(string: "")` - Specifies a duration using [duration format strings](/docs/concepts/duration-format) used as a safety buffer to ensure certificates are not expunged prematurely; as an example, this can keep certificates from being removed from the CRL that, due to clock skew, might @@ -3332,6 +3341,10 @@ expiration time. the time must be after the expiration time of the certificate (according to the local clock) plus the duration of `safety_buffer`. Defaults to `72h`. +- `issuer_safety_buffer` `(string: "")` - Specifies a duration that issuers + should be kept for, past their `NotAfter` validity period. Defaults to + 365 days as hours (`8760h`). + - `pause_duration` `(string: "0s")` - Specifies the duration to pause between tidying individual certificates. This releases the revocation lock and allows other operations to continue while tidy is running. @@ -3340,6 +3353,8 @@ expiration time. between reading, parsing, and updates on-disk cert entries will be increased, decreasing resource utilization. + Does not affect `tidy_expired_issuers`. + ~> Note: Using too long of a `pause_duration` can result in tidy operations not concluding during this lifetime! Using too short of a pause duration (but non-zero) can lead to lock contention. Use [tidy's cancellation](#cancel-tidy)