diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index 71cbd7b6f..5a8d00d29 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -131,6 +131,7 @@ func Backend(conf *logical.BackendConfig) *backend { pathRevoke(&b), pathRevokeWithKey(&b), pathTidy(&b), + pathTidyCancel(&b), pathTidyStatus(&b), pathConfigAutoTidy(&b), @@ -184,6 +185,7 @@ func Backend(conf *logical.BackendConfig) *backend { } b.tidyCASGuard = new(uint32) + b.tidyCancelCAS = new(uint32) b.tidyStatus = &tidyStatus{state: tidyStatusInactive} b.storage = conf.StorageView b.backendUUID = conf.BackendUUID @@ -205,6 +207,7 @@ type backend struct { storage logical.Storage revokeStorageLock sync.RWMutex tidyCASGuard *uint32 + tidyCancelCAS *uint32 tidyStatusLock sync.RWMutex tidyStatus *tidyStatus @@ -223,10 +226,12 @@ type ( ) const ( - tidyStatusInactive tidyStatusState = iota - tidyStatusStarted - tidyStatusFinished - tidyStatusError + tidyStatusInactive tidyStatusState = iota + tidyStatusStarted = iota + tidyStatusFinished = iota + tidyStatusError = iota + tidyStatusCancelling = iota + tidyStatusCancelled = iota ) type tidyStatus struct { @@ -235,6 +240,7 @@ type tidyStatus struct { tidyCertStore bool tidyRevokedCerts bool tidyRevokedAssocs bool + pauseDuration string // Status state tidyStatusState diff --git a/builtin/logical/pki/backend_test.go b/builtin/logical/pki/backend_test.go index 9f5b5b91d..7517ab35d 100644 --- a/builtin/logical/pki/backend_test.go +++ b/builtin/logical/pki/backend_test.go @@ -3877,6 +3877,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { "tidy_cert_store": true, "tidy_revoked_certs": true, "tidy_revoked_cert_issuer_associations": false, + "pause_duration": "0s", "state": "Finished", "error": nil, "time_started": nil, diff --git a/builtin/logical/pki/fields.go b/builtin/logical/pki/fields.go index d977735e9..497ea8893 100644 --- a/builtin/logical/pki/fields.go +++ b/builtin/logical/pki/fields.go @@ -463,5 +463,17 @@ Defaults to 72 hours.`, Default: int(defaultTidyConfig.SafetyBuffer / time.Second), // TypeDurationSecond currently requires defaults to be int } + fields["pause_duration"] = &framework.FieldSchema{ + Type: framework.TypeString, + Description: `The amount of time to wait between processing +certificates. This allows operators to change the execution profile +of tidy to take consume less resources by slowing down how long it +takes to run. Note that the entire list of certificates will be +stored in memory during the entire tidy operation, but resources to +read/process/update existing entries will be spread out over a +greater period of time. By default this is zero seconds.`, + Default: "0s", + } + return fields } diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index 15231a280..5d3114444 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -3,6 +3,7 @@ package pki import ( "context" "crypto/x509" + "errors" "fmt" "net/http" "sync/atomic" @@ -16,22 +17,26 @@ import ( "github.com/hashicorp/vault/sdk/logical" ) +var tidyCancelledError = errors.New("tidy operation cancelled") + type tidyConfig struct { - Enabled bool `json:"enabled"` - Interval time.Duration `json:"interval_duration"` - CertStore bool `json:"tidy_cert_store"` - RevokedCerts bool `json:"tidy_revoked_certs"` - IssuerAssocs bool `json:"tidy_revoked_cert_issuer_associations"` - SafetyBuffer time.Duration `json:"safety_buffer"` + Enabled bool `json:"enabled"` + Interval time.Duration `json:"interval_duration"` + CertStore bool `json:"tidy_cert_store"` + RevokedCerts bool `json:"tidy_revoked_certs"` + IssuerAssocs bool `json:"tidy_revoked_cert_issuer_associations"` + SafetyBuffer time.Duration `json:"safety_buffer"` + PauseDuration time.Duration `json:"pause_duration"` } var defaultTidyConfig = tidyConfig{ - Enabled: false, - Interval: 12 * time.Hour, - CertStore: false, - RevokedCerts: false, - IssuerAssocs: false, - SafetyBuffer: 72 * time.Hour, + Enabled: false, + Interval: 12 * time.Hour, + CertStore: false, + RevokedCerts: false, + IssuerAssocs: false, + SafetyBuffer: 72 * time.Hour, + PauseDuration: 0 * time.Second, } func pathTidy(b *backend) *framework.Path { @@ -49,6 +54,20 @@ func pathTidy(b *backend) *framework.Path { } } +func pathTidyCancel(b *backend) *framework.Path { + return &framework.Path{ + Pattern: "tidy-cancel$", + Operations: map[logical.Operation]framework.OperationHandler{ + logical.UpdateOperation: &framework.PathOperation{ + Callback: b.pathTidyCancelWrite, + ForwardPerformanceStandby: true, + }, + }, + HelpSynopsis: pathTidyCancelHelpSyn, + HelpDescription: pathTidyCancelHelpDesc, + } +} + func pathTidyStatus(b *backend) *framework.Path { return &framework.Path{ Pattern: "tidy-status$", @@ -98,21 +117,36 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr tidyCertStore := d.Get("tidy_cert_store").(bool) tidyRevokedCerts := d.Get("tidy_revoked_certs").(bool) || d.Get("tidy_revocation_list").(bool) tidyRevokedAssocs := d.Get("tidy_revoked_cert_issuer_associations").(bool) + pauseDurationStr := d.Get("pause_duration").(string) + pauseDuration := 0 * time.Second if safetyBuffer < 1 { return logical.ErrorResponse("safety_buffer must be greater than zero"), nil } + if pauseDurationStr != "" { + var err error + pauseDuration, err = time.ParseDuration(pauseDurationStr) + if err != nil { + return logical.ErrorResponse(fmt.Sprintf("Error parsing pause_duration: %v", err)), nil + } + + if pauseDuration < (0 * time.Second) { + return logical.ErrorResponse("received invalid, negative pause_duration"), nil + } + } + bufferDuration := time.Duration(safetyBuffer) * time.Second // Manual run with constructed configuration. config := &tidyConfig{ - Enabled: true, - Interval: 0 * time.Second, - CertStore: tidyCertStore, - RevokedCerts: tidyRevokedCerts, - IssuerAssocs: tidyRevokedAssocs, - SafetyBuffer: bufferDuration, + Enabled: true, + Interval: 0 * time.Second, + CertStore: tidyCertStore, + RevokedCerts: tidyRevokedCerts, + IssuerAssocs: tidyRevokedAssocs, + SafetyBuffer: bufferDuration, + PauseDuration: pauseDuration, } if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) { @@ -148,6 +182,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr func (b *backend) startTidyOperation(req *logical.Request, config *tidyConfig) { go func() { + atomic.StoreUint32(b.tidyCancelCAS, 0) defer atomic.StoreUint32(b.tidyCASGuard, 0) b.tidyStatusStart(config) @@ -164,9 +199,14 @@ func (b *backend) startTidyOperation(req *logical.Request, config *tidyConfig) { } } + // Check for cancel before continuing. + if atomic.CompareAndSwapUint32(b.tidyCancelCAS, 1, 0) { + return tidyCancelledError + } + if config.RevokedCerts || config.IssuerAssocs { if err := b.doTidyRevocationStore(ctx, req, logger, config); err != nil { - return nil + return err } } @@ -201,6 +241,16 @@ func (b *backend) doTidyCertStore(ctx context.Context, req *logical.Request, log b.tidyStatusMessage(fmt.Sprintf("Tidying certificate store: checking entry %d of %d", i, serialCount)) metrics.SetGauge([]string{"secrets", "pki", "tidy", "cert_store_current_entry"}, float32(i)) + // Check for cancel before continuing. + if atomic.CompareAndSwapUint32(b.tidyCancelCAS, 1, 0) { + return tidyCancelledError + } + + // Check for pause duration to reduce resource consumption. + if config.PauseDuration > (0 * time.Second) { + time.Sleep(config.PauseDuration) + } + certEntry, err := req.Storage.Get(ctx, "certs/"+serial) if err != nil { return fmt.Errorf("error fetching certificate %q: %w", serial, err) @@ -272,6 +322,18 @@ func (b *backend) doTidyRevocationStore(ctx context.Context, req *logical.Reques b.tidyStatusMessage(fmt.Sprintf("Tidying revoked certificates: checking certificate %d of %d", i, len(revokedSerials))) metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_current_entry"}, float32(i)) + // Check for cancel before continuing. + if atomic.CompareAndSwapUint32(b.tidyCancelCAS, 1, 0) { + return tidyCancelledError + } + + // Check for pause duration to reduce resource consumption. + if config.PauseDuration > (0 * time.Second) { + b.revokeStorageLock.Unlock() + time.Sleep(config.PauseDuration) + b.revokeStorageLock.Lock() + } + revokedEntry, err := req.Storage.Get(ctx, "revoked/"+serial) if err != nil { return fmt.Errorf("unable to fetch revoked cert with serial %q: %w", serial, err) @@ -379,6 +441,33 @@ func (b *backend) doTidyRevocationStore(ctx context.Context, req *logical.Reques return nil } +func (b *backend) pathTidyCancelWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) { + if b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary) && !b.System().LocalMount() { + return nil, logical.ErrReadOnly + } + + if atomic.LoadUint32(b.tidyCASGuard) == 0 { + resp := &logical.Response{} + resp.AddWarning("Tidy operation cannot be cancelled as none is currently running.") + return resp, nil + } + + // Grab the status lock before writing the cancel atomic. This lets us + // update the status correctly as well, avoiding writing it if we're not + // presently running. + // + // Unlock needs to occur prior to calling read. + b.tidyStatusLock.Lock() + if b.tidyStatus.state == tidyStatusStarted || atomic.LoadUint32(b.tidyCASGuard) == 1 { + if atomic.CompareAndSwapUint32(b.tidyCancelCAS, 0, 1) { + b.tidyStatus.state = tidyStatusCancelling + } + } + b.tidyStatusLock.Unlock() + + return b.pathTidyStatusRead(ctx, req, d) +} + func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *framework.FieldData) (*logical.Response, error) { // If this node is a performance secondary return an ErrReadOnly so that the request gets forwarded, // but only if the PKI backend is not a local mount. @@ -391,17 +480,19 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp := &logical.Response{ Data: map[string]interface{}{ - "safety_buffer": nil, - "tidy_cert_store": nil, - "tidy_revoked_certs": nil, - "state": "Inactive", - "error": nil, - "time_started": nil, - "time_finished": nil, - "message": nil, - "cert_store_deleted_count": nil, - "revoked_cert_deleted_count": nil, - "missing_issuer_cert_count": nil, + "safety_buffer": nil, + "tidy_cert_store": nil, + "tidy_revoked_certs": nil, + "tidy_revoked_cert_issuer_associations": nil, + "pause_duration": nil, + "state": "Inactive", + "error": nil, + "time_started": nil, + "time_finished": nil, + "message": nil, + "cert_store_deleted_count": nil, + "revoked_cert_deleted_count": nil, + "missing_issuer_cert_count": nil, }, } @@ -413,6 +504,7 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["tidy_cert_store"] = b.tidyStatus.tidyCertStore resp.Data["tidy_revoked_certs"] = b.tidyStatus.tidyRevokedCerts resp.Data["tidy_revoked_cert_issuer_associations"] = b.tidyStatus.tidyRevokedAssocs + resp.Data["pause_duration"] = b.tidyStatus.pauseDuration resp.Data["time_started"] = b.tidyStatus.timeStarted resp.Data["message"] = b.tidyStatus.message resp.Data["cert_store_deleted_count"] = b.tidyStatus.certStoreDeletedCount @@ -432,6 +524,11 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["error"] = b.tidyStatus.err.Error() // Don't clear the message so that it serves as a hint about when // the error occurred. + case tidyStatusCancelling: + resp.Data["state"] = "Cancelling" + case tidyStatusCancelled: + resp.Data["state"] = "Cancelled" + resp.Data["time_finished"] = b.tidyStatus.timeFinished } return resp, nil @@ -452,6 +549,7 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque "tidy_revoked_certs": config.RevokedCerts, "tidy_revoked_cert_issuer_associations": config.IssuerAssocs, "safety_buffer": int(config.SafetyBuffer / time.Second), + "pause_duration": config.PauseDuration.String(), }, }, nil } @@ -493,6 +591,17 @@ func (b *backend) pathConfigAutoTidyWrite(ctx context.Context, req *logical.Requ } } + if pauseDurationRaw, ok := d.GetOk("pause_duration"); ok { + config.PauseDuration, err = time.ParseDuration(pauseDurationRaw.(string)) + if err != nil { + return logical.ErrorResponse(fmt.Sprintf("unable to parse given pause_duration: %v", err)), nil + } + + if config.PauseDuration < (0 * time.Second) { + return logical.ErrorResponse("received invalid, negative pause_duration"), nil + } + } + if config.Enabled && !(config.CertStore || config.RevokedCerts || config.IssuerAssocs) { return logical.ErrorResponse(fmt.Sprintf("Auto-tidy enabled but no tidy operations were requested. Enable at least one tidy operation to be run (tidy_cert_store / tidy_revoked_certs / tidy_revoked_cert_issuer_associations).")), nil } @@ -509,8 +618,10 @@ func (b *backend) tidyStatusStart(config *tidyConfig) { tidyCertStore: config.CertStore, tidyRevokedCerts: config.RevokedCerts, tidyRevokedAssocs: config.IssuerAssocs, - state: tidyStatusStarted, - timeStarted: time.Now(), + pauseDuration: config.PauseDuration.String(), + + state: tidyStatusStarted, + timeStarted: time.Now(), } metrics.SetGauge([]string{"secrets", "pki", "tidy", "start_time_epoch"}, float32(b.tidyStatus.timeStarted.Unix())) @@ -524,6 +635,8 @@ func (b *backend) tidyStatusStop(err error) { b.tidyStatus.err = err if err == nil { b.tidyStatus.state = tidyStatusFinished + } else if err == tidyCancelledError { + b.tidyStatus.state = tidyStatusCancelled } else { b.tidyStatus.state = tidyStatusError } @@ -595,6 +708,18 @@ current time, minus the value of 'safety_buffer', is greater than the expiration, it will be removed. ` +const pathTidyCancelHelpSyn = ` +Cancels a currently running tidy operation. +` + +const pathTidyCancelHelpDesc = ` +This endpoint allows cancelling a currently running tidy operation. + +Periodically throughout the invocation of tidy, we'll check if the operation +has been requested to be cancelled. If so, we'll stop the currently running +tidy operation. +` + const pathTidyStatusHelpSyn = ` Returns the status of the tidy operation. ` @@ -619,6 +744,16 @@ The result includes the following fields: * 'missing_issuer_cert_count': The number of revoked certificates which were missing a valid issuer reference ` -const pathConfigAutoTidySyn = `` +const pathConfigAutoTidySyn = ` +Modifies the current configuration for automatic tidy execution. +` -const pathConfigAutoTidyDesc = `` +const pathConfigAutoTidyDesc = ` +This endpoint accepts parameters to a tidy operation (see /tidy) that +will be used for automatic tidy execution. This takes two extra parameters, +enabled (to enable or disable auto-tidy) and interval_duration (which +controls the frequency of auto-tidy execution). + +Once enabled, a tidy operation will be kicked off automatically, as if it +were executed with the posted configuration. +` diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index 2db8f6c4f..661959867 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -155,3 +155,81 @@ func TestAutoTidy(t *testing.T) { require.Nil(t, err) require.Nil(t, resp) } + +func TestTidyCancellation(t *testing.T) { + t.Parallel() + + numLeaves := 100 + + b, s := createBackendWithStorage(t) + + // Create a root, a role, and a bunch of leaves. + _, err := CBWrite(b, s, "root/generate/internal", map[string]interface{}{ + "common_name": "root example.com", + "issuer_name": "root", + "ttl": "20m", + "key_type": "ec", + }) + require.NoError(t, err) + _, err = CBWrite(b, s, "roles/local-testing", map[string]interface{}{ + "allow_any_name": true, + "enforce_hostnames": false, + "key_type": "ec", + }) + require.NoError(t, err) + for i := 0; i < numLeaves; i++ { + _, err = CBWrite(b, s, "issue/local-testing", map[string]interface{}{ + "common_name": "testing", + "ttl": "1s", + }) + require.NoError(t, err) + } + + // Kick off a tidy operation (which runs in the background), but with + // a slow-ish pause between certificates. + _, err = CBWrite(b, s, "tidy", map[string]interface{}{ + "tidy_cert_store": true, + "safety_buffer": "1s", + "pause_duration": "1s", + }) + + // If we wait six seconds, the operation should still be running. That's + // how we check that pause_duration works. + time.Sleep(3 * time.Second) + + resp, err := CBRead(b, s, "tidy-status") + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.Equal(t, resp.Data["state"], "Running") + + // If we now cancel the operation, the response should say Cancelling. + cancelResp, err := CBWrite(b, s, "tidy-cancel", map[string]interface{}{}) + require.NoError(t, err) + require.NotNil(t, cancelResp) + require.NotNil(t, cancelResp.Data) + state := cancelResp.Data["state"].(string) + howMany := cancelResp.Data["cert_store_deleted_count"].(uint) + + if state == "Cancelled" { + // Rest of the test can't run; log and exit. + t.Log("Went to cancel the operation but response was already cancelled") + return + } + + require.Equal(t, state, "Cancelling") + + // Wait a little longer, and ensure we only processed at most 2 more certs + // after the cancellation respon. + time.Sleep(3 * time.Second) + + statusResp, err := CBRead(b, s, "tidy-status") + require.NoError(t, err) + require.NotNil(t, statusResp) + require.NotNil(t, statusResp.Data) + require.Equal(t, statusResp.Data["state"], "Cancelled") + nowMany := statusResp.Data["cert_store_deleted_count"].(uint) + if howMany+3 <= nowMany { + t.Fatalf("expected to only process at most 3 more certificates, but processed (%v >>> %v) certs", nowMany, howMany) + } +} diff --git a/changelog/16958.txt b/changelog/16958.txt new file mode 100644 index 000000000..a77af9a7b --- /dev/null +++ b/changelog/16958.txt @@ -0,0 +1,3 @@ +```release-note:improvement +secrets/pki: Add ability to cancel tidy operations, control tidy resource usage. +``` diff --git a/website/content/api-docs/secret/pki.mdx b/website/content/api-docs/secret/pki.mdx index 112cf53ef..d57bfb8ef 100644 --- a/website/content/api-docs/secret/pki.mdx +++ b/website/content/api-docs/secret/pki.mdx @@ -70,6 +70,7 @@ update your API calls accordingly. - [Tidy](#tidy) - [Configure Automatic Tidy](#configure-automatic-tidy) - [Tidy Status](#tidy-status) + - [Cancel Tidy](#cancel-tidy) - [Cluster Scalability](#cluster-scalability) - [Managed Key](#managed-keys) (Enterprise Only) - [Vault CLI with DER/PEM responses](#vault-cli-with-der-pem-responses) @@ -3197,6 +3198,18 @@ expiration time. the time must be after the expiration time of the certificate (according to the local clock) plus the duration of `safety_buffer`. Defaults to `72h`. +- `pause_duration` `(string: "0s")` - Specifies the duration to pause + between tidying individual certificates. This releases the revocation + lock and allows other operations to continue while tidy is running. + This allows an operator to control tidy's resource utilization within + a timespan: the LIST operation will remain in memory, but the space + between reading, parsing, and updates on-disk cert entries will be + increased, decreasing resource utilization. + +~> Note: Using too long of a `pause_duration` can result in tidy operations + not concluding during this lifetime! Using too short of a pause duration + (but non-zero) can lead to lock contention. Use [tidy's cancellation](#cancel-tidy) + to stop a running operation after the sleep period is over. #### Sample Payload @@ -3325,6 +3338,46 @@ $ curl \ }, ``` +### Cancel Tidy + +This endpoint allows cancelling a running tidy operation. It takes no +parameter and cancels the tidy at the next available checkpoint, which +may process additional certificates between when the operation was +marked as cancelled and when the operation stopped. + +The response to this endpoint is the same as the [status](#tidy-status). + +| Method | Path | +| :----- | :----------------- | +| `POST` | `/pki/tidy-cancel` | + +#### Sample Request + +```shell-session +$ curl \ + --header "X-Vault-Token: ..." \ + --request POST \ + http://127.0.0.1:8200/v1/pki/tidy-cancel + +``` + +#### Sample Response + +```json + "data": { + "safety_buffer": 60, + "tidy_cert_store": true, + "tidy_revoked_certs": true, + "error": null, + "message": "Tidying certificate store: checking entry 234 of 488", + "revoked_cert_deleted_count": 0, + "cert_store_deleted_count": 2, + "state": "Cancelling", + "time_started": "2021-10-20T14:52:13.510161-04:00", + "time_finished": null + }, +``` + --- ## Cluster Scalability