From 14adb3b825004af9b5bc530e242527239fa37d3f Mon Sep 17 00:00:00 2001 From: Kit Haines Date: Fri, 10 Feb 2023 16:31:56 -0500 Subject: [PATCH] Telemetry Metrics Configuration. (#18186) * Telemetry Metrics Configuration. * Err Shadowing Fix (woah, semgrep is cool). * Fix TestBackend_RevokePlusTidy_Intermediate * Add Changelog. * Fix memory leak. Code cleanup as suggested by Steve. * Turn off metrics by default, breaking-change. * Show on tidy-status before start-up. * Fix tests * make fmt * Add emit metrics to periodicFunc * Test not delivering unavailable metrics + fix. * Better error message. * Fixing the false-error bug. * make fmt. * Try to fix race issue, remove confusing comments. * Switch metric counter variables to an atomic.Uint32 - Switch the metric counter variables to an atomic variable type so that we are forced to properly load/store values to it * Fix race-issue better by trying until the metric is sunk. * make fmt. * empty commit to retrigger non-race tests that all pass locally --------- Co-authored-by: Steve Clark --- builtin/logical/pki/backend.go | 165 ++++++++---- builtin/logical/pki/backend_test.go | 49 +++- builtin/logical/pki/cert_util.go | 4 +- builtin/logical/pki/crl_util.go | 2 +- builtin/logical/pki/fields.go | 17 ++ builtin/logical/pki/path_issue_sign.go | 2 +- builtin/logical/pki/path_root.go | 4 +- builtin/logical/pki/path_tidy.go | 70 +++-- builtin/logical/pki/path_tidy_test.go | 341 +++++++++++++++++++++++++ builtin/logical/pki/storage.go | 28 +- changelog/18186.txt | 6 + 11 files changed, 600 insertions(+), 88 deletions(-) create mode 100644 changelog/18186.txt diff --git a/builtin/logical/pki/backend.go b/builtin/logical/pki/backend.go index e9f509025..1b093d03c 100644 --- a/builtin/logical/pki/backend.go +++ b/builtin/logical/pki/backend.go @@ -9,12 +9,12 @@ import ( "sync/atomic" "time" + atomic2 "go.uber.org/atomic" + "github.com/hashicorp/vault/helper/constants" "github.com/hashicorp/go-multierror" - atomic2 "go.uber.org/atomic" - "github.com/hashicorp/vault/sdk/helper/consts" "github.com/armon/go-metrics" @@ -244,9 +244,12 @@ func Backend(conf *logical.BackendConfig) *backend { b.lastTidy = time.Now() // Metrics initialization for count of certificates in storage + b.certCountEnabled = atomic2.NewBool(false) + b.publishCertCountMetrics = atomic2.NewBool(false) b.certsCounted = atomic2.NewBool(false) - b.certCount = new(uint32) - b.revokedCertCount = new(uint32) + b.certCountError = "Initialize Not Yet Run, Cert Counts Unavailable" + b.certCount = &atomic.Uint32{} + b.revokedCertCount = &atomic.Uint32{} b.possibleDoubleCountedSerials = make([]string, 0, 250) b.possibleDoubleCountedRevokedSerials = make([]string, 0, 250) @@ -270,9 +273,12 @@ type backend struct { unifiedTransferStatus *unifiedTransferStatus - certCount *uint32 - revokedCertCount *uint32 + certCountEnabled *atomic2.Bool + publishCertCountMetrics *atomic2.Bool + certCount *atomic.Uint32 + revokedCertCount *atomic.Uint32 certsCounted *atomic2.Bool + certCountError string possibleDoubleCountedSerials []string possibleDoubleCountedRevokedSerials []string @@ -364,9 +370,10 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque // Initialize also needs to populate our certificate and revoked certificate count err = b.initializeStoredCertificateCounts(ctx) if err != nil { - return err + // Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates. + b.Logger().Error("Could not initialize stored certificate counts", err) + b.certCountError = err.Error() } - return nil } @@ -601,6 +608,13 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er crlErr := doCRL() tidyErr := doAutoTidy() + // Periodically re-emit gauges so that they don't disappear/go stale + tidyConfig, err := sc.getAutoTidyConfig() + if err != nil { + return err + } + b.emitCertStoreMetrics(tidyConfig) + var errors error if crlErr != nil { errors = multierror.Append(errors, fmt.Errorf("Error building CRLs:\n - %w\n", crlErr)) @@ -625,24 +639,51 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er } func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { - b.tidyStatusLock.RLock() - defer b.tidyStatusLock.RUnlock() // For performance reasons, we can't lock on issuance/storage of certs until a list operation completes, // but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which // prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in // tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint) + b.tidyStatusLock.RLock() + defer b.tidyStatusLock.RUnlock() + sc := b.makeStorageContext(ctx, b.storage) + config, err := sc.getAutoTidyConfig() + if err != nil { + return err + } + + b.certCountEnabled.Store(config.MaintainCount) + b.publishCertCountMetrics.Store(config.PublishMetrics) + + if config.MaintainCount == false { + b.possibleDoubleCountedRevokedSerials = nil + b.possibleDoubleCountedSerials = nil + b.certsCounted.Store(true) + b.certCount.Store(0) + b.revokedCertCount.Store(0) + b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" + return nil + } + + // Ideally these three things would be set in one transaction, since that isn't possible, set the counts to "0", + // first, so count will over-count (and miss putting things in deduplicate queue), rather than under-count. + b.certCount.Store(0) + b.revokedCertCount.Store(0) + b.possibleDoubleCountedRevokedSerials = nil + b.possibleDoubleCountedSerials = nil + // A cert issued or revoked here will be double-counted. That's okay, this is "best effort" metrics. + b.certsCounted.Store(false) entries, err := b.storage.List(ctx, "certs/") if err != nil { return err } - atomic.AddUint32(b.certCount, uint32(len(entries))) + b.certCount.Add(uint32(len(entries))) revokedEntries, err := b.storage.List(ctx, "revoked/") if err != nil { return err } - atomic.AddUint32(b.revokedCertCount, uint32(len(revokedEntries))) + b.revokedCertCount.Add(uint32(len(revokedEntries))) b.certsCounted.Store(true) // Now that the metrics are set, we can switch from appending newly-stored certificates to the possible double-count @@ -723,64 +764,98 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error { b.possibleDoubleCountedRevokedSerials = nil b.possibleDoubleCountedSerials = nil - certCount := atomic.LoadUint32(b.certCount) - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) - revokedCertCount := atomic.LoadUint32(b.revokedCertCount) - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) + b.emitCertStoreMetrics(config) + + b.certCountError = "" return nil } -// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call: -// eg. certsCounted := b.certsCounted.Load() -func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) { - certCount := atomic.AddUint32(b.certCount, 1) - switch { - case !certsCounted: - // This is unsafe, but a good best-attempt - if strings.HasPrefix(newSerial, "certs/") { - newSerial = newSerial[6:] - } - b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial) - default: - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) +func (b *backend) emitCertStoreMetrics(config *tidyConfig) { + if config.PublishMetrics == true { + certCount := b.certCount.Load() + b.emitTotalCertCountMetric(certCount) + revokedCertCount := b.revokedCertCount.Load() + b.emitTotalRevokedCountMetric(revokedCertCount) } } -func (b *backend) decrementTotalCertificatesCountReport() { - certCount := b.decrementTotalCertificatesCountNoReport() +// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call: +// eg. certsCounted := b.certsCounted.Load() +func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) { + if b.certCountEnabled.Load() { + certCount := b.certCount.Add(1) + switch { + case !certsCounted: + // This is unsafe, but a good best-attempt + if strings.HasPrefix(newSerial, "certs/") { + newSerial = newSerial[6:] + } + b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial) + default: + if b.publishCertCountMetrics.Load() { + b.emitTotalCertCountMetric(certCount) + } + } + } +} + +func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() { + if b.certCountEnabled.Load() { + certCount := b.decrementTotalCertificatesCountNoReport() + if b.publishCertCountMetrics.Load() { + b.emitTotalCertCountMetric(certCount) + } + } +} + +func (b *backend) emitTotalCertCountMetric(certCount uint32) { metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount)) } // Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet +// Does not respect whether-we-are-counting backend information. func (b *backend) decrementTotalCertificatesCountNoReport() uint32 { - newCount := atomic.AddUint32(b.certCount, ^uint32(0)) + newCount := b.certCount.Add(^uint32(0)) return newCount } // The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call: // eg. certsCounted := b.certsCounted.Load() -func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) { - newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1) - switch { - case !certsCounted: - // This is unsafe, but a good best-attempt - if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial - newSerial = newSerial[8:] +func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) { + if b.certCountEnabled.Load() { + newRevokedCertCount := b.revokedCertCount.Add(1) + switch { + case !certsCounted: + // This is unsafe, but a good best-attempt + if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial + newSerial = newSerial[8:] + } + b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial) + default: + if b.publishCertCountMetrics.Load() { + b.emitTotalRevokedCountMetric(newRevokedCertCount) + } } - b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial) - default: - metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount)) } } -func (b *backend) decrementTotalRevokedCertificatesCountReport() { - revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport() +func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() { + if b.certCountEnabled.Load() { + revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport() + if b.publishCertCountMetrics.Load() { + b.emitTotalRevokedCountMetric(revokedCertCount) + } + } +} + +func (b *backend) emitTotalRevokedCountMetric(revokedCertCount uint32) { metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount)) } // Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet +// Does not respect whether-we-are-counting backend information. func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 { - newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0)) + newRevokedCertCount := b.revokedCertCount.Add(^uint32(0)) return newRevokedCertCount } diff --git a/builtin/logical/pki/backend_test.go b/builtin/logical/pki/backend_test.go index 51f986b53..6e07ccfee 100644 --- a/builtin/logical/pki/backend_test.go +++ b/builtin/logical/pki/backend_test.go @@ -27,7 +27,6 @@ import ( "strconv" "strings" "sync" - "sync/atomic" "testing" "time" @@ -3797,6 +3796,15 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { t.Fatal(err) } + // Set up Metric Configuration, then restart to enable it + _, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{ + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": true, + }) + _, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{ + "mounts": "pki/", + }) + // Check the metrics initialized in order to calculate backendUUID for /pki // BackendUUID not consistent during tests with UUID from /sys/mounts/pki metricsSuffix := "total_certificates_stored" @@ -3833,6 +3841,14 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { if err != nil { t.Fatal(err) } + // Set up Metric Configuration, then restart to enable it + _, err = client.Logical().Write("pki2/config/auto-tidy", map[string]interface{}{ + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": true, + }) + _, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{ + "mounts": "pki2/", + }) // Create a CSR for the intermediate CA secret, err := client.Logical().Write("pki2/intermediate/generate/internal", nil) @@ -3961,6 +3977,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { "current_revoked_cert_count": json.Number("0"), "revocation_queue_deleted_count": json.Number("0"), "cross_revoked_cert_deleted_count": json.Number("0"), + "internal_backend_uuid": backendUUID, } // Let's copy the times from the response so that we can use deep.Equal() timeStarted, ok := tidyStatus.Data["time_started"] @@ -5590,6 +5607,14 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) { serials[i] = resp.Data["serial_number"].(string) } + // Turn on certificate counting: + CBWrite(b, s, "config/auto-tidy", map[string]interface{}{ + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": false, + }) + // Assert initialize from clean is correct: + b.initializeStoredCertificateCounts(ctx) + // Revoke certificates A + B revocations := serials[0:2] for _, key := range revocations { @@ -5601,18 +5626,16 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) { } } - // Assert initialize from clean is correct: - b.initializeStoredCertificateCounts(ctx) - if atomic.LoadUint32(b.certCount) != 6 { - t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", atomic.LoadUint32(b.certCount)) + if b.certCount.Load() != 6 { + t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", b.certCount.Load()) } - if atomic.LoadUint32(b.revokedCertCount) != 2 { - t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", atomic.LoadUint32(b.revokedCertCount)) + if b.revokedCertCount.Load() != 2 { + t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", b.revokedCertCount.Load()) } // Simulates listing while initialize in progress, by "restarting it" - atomic.StoreUint32(b.certCount, 0) - atomic.StoreUint32(b.revokedCertCount, 0) + b.certCount.Store(0) + b.revokedCertCount.Store(0) b.certsCounted.Store(false) // Revoke certificates C, D @@ -5641,12 +5664,12 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) { b.initializeStoredCertificateCounts(ctx) // Test certificate count - if atomic.LoadUint32(b.certCount) != 8 { - t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", *(b.certCount)) + if b.certCount.Load() != 8 { + t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", b.certCount.Load()) } - if atomic.LoadUint32(b.revokedCertCount) != 4 { - t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", *(b.revokedCertCount)) + if b.revokedCertCount.Load() != 4 { + t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", b.revokedCertCount.Load()) } return diff --git a/builtin/logical/pki/cert_util.go b/builtin/logical/pki/cert_util.go index 45f315c94..5845c8085 100644 --- a/builtin/logical/pki/cert_util.go +++ b/builtin/logical/pki/cert_util.go @@ -234,9 +234,9 @@ func fetchCertBySerial(sc *storageContext, prefix, serial string) (*logical.Stor // If we fail here, we have an extra (copy) of a cert in storage, add to metrics: switch { case strings.HasPrefix(prefix, "revoked/"): - sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, path) + sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, path) default: - sc.Backend.incrementTotalCertificatesCount(certsCounted, path) + sc.Backend.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, path) } return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)} } diff --git a/builtin/logical/pki/crl_util.go b/builtin/logical/pki/crl_util.go index 7cd975f75..cb41eb617 100644 --- a/builtin/logical/pki/crl_util.go +++ b/builtin/logical/pki/crl_util.go @@ -978,7 +978,7 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) ( if err != nil { return nil, fmt.Errorf("error saving revoked certificate to new location") } - sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key) + sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key) // If this flag is enabled after the fact, existing local entries will be published to // the unified storage space through a periodic function. diff --git a/builtin/logical/pki/fields.go b/builtin/logical/pki/fields.go index e806f473a..722d7bfc9 100644 --- a/builtin/logical/pki/fields.go +++ b/builtin/logical/pki/fields.go @@ -518,6 +518,23 @@ greater period of time. By default this is zero seconds.`, Default: "0s", } + fields["maintain_stored_certificate_counts"] = &framework.FieldSchema{ + Type: framework.TypeBool, + Description: `This configures whether stored certificates +are counted upon initialization of the backend, and whether during +normal operation, a running count of certificates stored is maintained.`, + Default: false, + } + + fields["publish_stored_certificate_count_metrics"] = &framework.FieldSchema{ + Type: framework.TypeBool, + Description: `This configures whether the stored certificate +count is published to the metrics consumer. It does not affect if the +stored certificate count is maintained, and if maintained, it will be +available on the tidy-status endpoint.`, + Default: false, + } + fields["tidy_revocation_queue"] = &framework.FieldSchema{ Type: framework.TypeBool, Description: `Set to true to remove stale revocation queue entries diff --git a/builtin/logical/pki/path_issue_sign.go b/builtin/logical/pki/path_issue_sign.go index ccb1251b6..c8bc0088c 100644 --- a/builtin/logical/pki/path_issue_sign.go +++ b/builtin/logical/pki/path_issue_sign.go @@ -419,7 +419,7 @@ func (b *backend) pathIssueSignCert(ctx context.Context, req *logical.Request, d if err != nil { return nil, fmt.Errorf("unable to store certificate locally: %w", err) } - b.incrementTotalCertificatesCount(certsCounted, key) + b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key) } if useCSR { diff --git a/builtin/logical/pki/path_root.go b/builtin/logical/pki/path_root.go index 03aa3a3e6..0eb33434d 100644 --- a/builtin/logical/pki/path_root.go +++ b/builtin/logical/pki/path_root.go @@ -271,7 +271,7 @@ func (b *backend) pathCAGenerateRoot(ctx context.Context, req *logical.Request, if err != nil { return nil, fmt.Errorf("unable to store certificate locally: %w", err) } - b.incrementTotalCertificatesCount(certsCounted, key) + b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key) // Build a fresh CRL err = b.crlBuilder.rebuild(sc, true) @@ -468,7 +468,7 @@ func (b *backend) pathIssuerSignIntermediate(ctx context.Context, req *logical.R if err != nil { return nil, fmt.Errorf("unable to store certificate locally: %w", err) } - b.incrementTotalCertificatesCount(certsCounted, key) + b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key) if parsedBundle.Certificate.MaxPathLen == 0 { resp.AddWarning("Max path length of the signed certificate is zero. This certificate cannot be used to issue intermediate CA certificates.") diff --git a/builtin/logical/pki/path_tidy.go b/builtin/logical/pki/path_tidy.go index 8bd22da10..4c0519c00 100644 --- a/builtin/logical/pki/path_tidy.go +++ b/builtin/logical/pki/path_tidy.go @@ -70,6 +70,8 @@ type tidyConfig struct { SafetyBuffer time.Duration `json:"safety_buffer"` IssuerSafetyBuffer time.Duration `json:"issuer_safety_buffer"` PauseDuration time.Duration `json:"pause_duration"` + MaintainCount bool `json:"maintain_stored_certificate_counts"` + PublishMetrics bool `json:"publish_stored_certificate_count_metrics"` RevocationQueue bool `json:"tidy_revocation_queue"` QueueSafetyBuffer time.Duration `json:"revocation_queue_safety_buffer"` CrossRevokedCerts bool `json:"tidy_cross_cluster_revoked_certs"` @@ -86,6 +88,8 @@ var defaultTidyConfig = tidyConfig{ SafetyBuffer: 72 * time.Hour, IssuerSafetyBuffer: 365 * 24 * time.Hour, PauseDuration: 0 * time.Second, + MaintainCount: false, + PublishMetrics: false, RevocationQueue: false, QueueSafetyBuffer: 48 * time.Hour, CrossRevokedCerts: false, @@ -973,11 +977,26 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f "missing_issuer_cert_count": nil, "current_cert_store_count": nil, "current_revoked_cert_count": nil, + "internal_backend_uuid": nil, "revocation_queue_deleted_count": nil, "cross_revoked_cert_deleted_count": nil, }, } + resp.Data["internal_backend_uuid"] = b.backendUUID + + if b.certCountEnabled.Load() { + resp.Data["current_cert_store_count"] = b.certCount.Load() + resp.Data["current_revoked_cert_count"] = b.revokedCertCount.Load() + if !b.certsCounted.Load() { + resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + + "inaccurate") + } + if b.certCountError != "" { + resp.Data["certificate_counting_error"] = b.certCountError + } + } + if b.tidyStatus.state == tidyStatusInactive { return resp, nil } @@ -1020,14 +1039,6 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f resp.Data["time_finished"] = b.tidyStatus.timeFinished } - resp.Data["current_cert_store_count"] = atomic.LoadUint32(b.certCount) - resp.Data["current_revoked_cert_count"] = atomic.LoadUint32(b.revokedCertCount) - - if !b.certsCounted.Load() { - resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " + - "inaccurate") - } - return resp, nil } @@ -1040,19 +1051,21 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque return &logical.Response{ Data: map[string]interface{}{ - "enabled": config.Enabled, - "interval_duration": int(config.Interval / time.Second), - "tidy_cert_store": config.CertStore, - "tidy_revoked_certs": config.RevokedCerts, - "tidy_revoked_cert_issuer_associations": config.IssuerAssocs, - "tidy_expired_issuers": config.ExpiredIssuers, - "tidy_move_legacy_ca_bundle": config.BackupBundle, - "safety_buffer": int(config.SafetyBuffer / time.Second), - "issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second), - "pause_duration": config.PauseDuration.String(), - "tidy_revocation_queue": config.RevocationQueue, - "revocation_queue_safety_buffer": int(config.QueueSafetyBuffer / time.Second), - "tidy_cross_cluster_revoked_certs": config.CrossRevokedCerts, + "enabled": config.Enabled, + "interval_duration": int(config.Interval / time.Second), + "tidy_cert_store": config.CertStore, + "tidy_revoked_certs": config.RevokedCerts, + "tidy_revoked_cert_issuer_associations": config.IssuerAssocs, + "tidy_expired_issuers": config.ExpiredIssuers, + "safety_buffer": int(config.SafetyBuffer / time.Second), + "issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second), + "pause_duration": config.PauseDuration.String(), + "publish_stored_certificate_count_metrics": config.PublishMetrics, + "maintain_stored_certificate_counts": config.MaintainCount, + "tidy_move_legacy_ca_bundle": config.BackupBundle, + "tidy_revocation_queue": config.RevocationQueue, + "revocation_queue_safety_buffer": int(config.QueueSafetyBuffer / time.Second), + "tidy_cross_cluster_revoked_certs": config.CrossRevokedCerts, }, }, nil } @@ -1139,6 +1152,17 @@ func (b *backend) pathConfigAutoTidyWrite(ctx context.Context, req *logical.Requ return logical.ErrorResponse("Auto-tidy enabled but no tidy operations were requested. Enable at least one tidy operation to be run (tidy_cert_store / tidy_revoked_certs / tidy_revoked_cert_issuer_associations / tidy_expired_issuers / tidy_move_legacy_ca_bundle / tidy_revocation_queue / tidy_cross_cluster_revoked_certs)."), nil } + if maintainCountEnabledRaw, ok := d.GetOk("maintain_stored_certificate_counts"); ok { + config.MaintainCount = maintainCountEnabledRaw.(bool) + } + + if runningStorageMetricsEnabledRaw, ok := d.GetOk("publish_stored_certificate_count_metrics"); ok { + if config.MaintainCount == false { + return logical.ErrorResponse("Can not publish a running storage metrics count to metrics without first maintaining that count. Enable `maintain_stored_certificate_counts` to enable `publish_stored_certificate_count_metrics."), nil + } + config.PublishMetrics = runningStorageMetricsEnabledRaw.(bool) + } + if err := sc.writeAutoTidyConfig(config); err != nil { return nil, err } @@ -1224,7 +1248,7 @@ func (b *backend) tidyStatusIncCertStoreCount() { b.tidyStatus.certStoreDeletedCount++ - b.decrementTotalCertificatesCountReport() + b.ifCountEnabledDecrementTotalCertificatesCountReport() } func (b *backend) tidyStatusIncRevokedCertCount() { @@ -1233,7 +1257,7 @@ func (b *backend) tidyStatusIncRevokedCertCount() { b.tidyStatus.revokedCertDeletedCount++ - b.decrementTotalRevokedCertificatesCountReport() + b.ifCountEnabledDecrementTotalRevokedCertificatesCountReport() } func (b *backend) tidyStatusIncMissingIssuerCertCount() { diff --git a/builtin/logical/pki/path_tidy_test.go b/builtin/logical/pki/path_tidy_test.go index 45d3d3a6a..29292fbd3 100644 --- a/builtin/logical/pki/path_tidy_test.go +++ b/builtin/logical/pki/path_tidy_test.go @@ -2,9 +2,15 @@ package pki import ( "encoding/json" + "errors" + "fmt" "testing" "time" + "github.com/hashicorp/vault/helper/testhelpers" + + "github.com/armon/go-metrics" + "github.com/hashicorp/vault/api" vaulthttp "github.com/hashicorp/vault/http" "github.com/hashicorp/vault/sdk/logical" @@ -405,3 +411,338 @@ func TestTidyIssuerConfig(t *testing.T) { require.Equal(t, true, resp.Data["tidy_expired_issuers"]) require.Equal(t, 5, resp.Data["issuer_safety_buffer"]) } + +// TestCertStorageMetrics ensures that when enabled, metrics are able to count the number of certificates in storage and +// number of revoked certificates in storage. Moreover, this test ensures that the gauge is emitted periodically, so +// that the metric does not disappear or go stale. +func TestCertStorageMetrics(t *testing.T) { + // This tests uses the same setup as TestAutoTidy + newPeriod := 1 * time.Second + + // We set up a metrics accumulator + inmemSink := metrics.NewInmemSink( + 2*newPeriod, // A short time period is ideal here to test metrics are emitted every periodic func + 2000000*time.Hour) + + metricsConf := metrics.DefaultConfig("") + metricsConf.EnableHostname = false + metricsConf.EnableHostnameLabel = false + metricsConf.EnableServiceLabel = false + metricsConf.EnableTypePrefix = false + + _, err := metrics.NewGlobal(metricsConf, inmemSink) + if err != nil { + t.Fatal(err) + } + + // This test requires the periodicFunc to trigger, which requires we stand + // up a full test cluster. + coreConfig := &vault.CoreConfig{ + LogicalBackends: map[string]logical.Factory{ + "pki": Factory, + }, + // See notes below about usage of /sys/raw for reading cluster + // storage without barrier encryption. + EnableRaw: true, + RollbackPeriod: newPeriod, + } + cluster := vault.NewTestCluster(t, coreConfig, &vault.TestClusterOptions{ + HandlerFunc: vaulthttp.Handler, + }) + cluster.Start() + defer cluster.Cleanup() + client := cluster.Cores[0].Client + + // Mount PKI + err = client.Sys().Mount("pki", &api.MountInput{ + Type: "pki", + Config: api.MountConfigInput{ + DefaultLeaseTTL: "10m", + MaxLeaseTTL: "60m", + }, + }) + require.NoError(t, err) + + // Generate root. + resp, err := client.Logical().Write("pki/root/generate/internal", map[string]interface{}{ + "ttl": "40h", + "common_name": "Root X1", + "key_type": "ec", + }) + require.NoError(t, err) + require.NotNil(t, resp) + require.NotEmpty(t, resp.Data) + require.NotEmpty(t, resp.Data["issuer_id"]) + + // Set up a testing role. + _, err = client.Logical().Write("pki/roles/local-testing", map[string]interface{}{ + "allow_any_name": true, + "enforce_hostnames": false, + "key_type": "ec", + }) + require.NoError(t, err) + + // Run tidy so that tidy-status is not empty + _, err = client.Logical().Write("pki/tidy", map[string]interface{}{ + "tidy_revoked_certs": true, + }) + require.NoError(t, err) + + // Since certificate counts are off by default, we shouldn't see counts in the tidy status + tidyStatus, err := client.Logical().Read("pki/tidy-status") + if err != nil { + t.Fatal(err) + } + // backendUUID should exist, we need this for metrics + backendUUID := tidyStatus.Data["internal_backend_uuid"].(string) + // "current_cert_store_count", "current_revoked_cert_count" + countData, ok := tidyStatus.Data["current_cert_store_count"] + if ok && countData != nil { + t.Fatalf("Certificate counting should be off by default, but current cert store count %v appeared in tidy status in unconfigured mount", countData) + } + revokedCountData, ok := tidyStatus.Data["current_revoked_cert_count"] + if ok && revokedCountData != nil { + t.Fatalf("Certificate counting should be off by default, but revoked cert count %v appeared in tidy status in unconfigured mount", revokedCountData) + } + + // Since certificate counts are off by default, those metrics should not exist yet + mostRecentInterval := inmemSink.Data()[len(inmemSink.Data())-1] + _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if ok { + t.Fatalf("Certificate counting should be off by default, but revoked cert count was emitted as a metric in an unconfigured mount") + } + _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if ok { + t.Fatalf("Certificate counting should be off by default, but total certificate count was emitted as a metric in an unconfigured mount") + } + + // Write the auto-tidy config. + _, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{ + "enabled": true, + "interval_duration": "1s", + "tidy_cert_store": true, + "tidy_revoked_certs": true, + "safety_buffer": "1s", + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": false, + }) + require.NoError(t, err) + + // Reload the Mount - Otherwise Stored Certificate Counts Will Not Be Populated + _, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{ + "plugin": "pki", + }) + + // By reading the auto-tidy endpoint, we ensure that initialize has completed (which has a write lock on auto-tidy) + _, err = client.Logical().Read("/pki/config/auto-tidy") + if err != nil { + t.Fatal(err) + } + + // Since publish_stored_certificate_count_metrics is still false, these metrics should still not exist yet + mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] + _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if ok { + t.Fatalf("Certificate counting should be off by default, but revoked cert count was emitted as a metric in an unconfigured mount") + } + _, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if ok { + t.Fatalf("Certificate counting should be off by default, but total certificate count was emitted as a metric in an unconfigured mount") + } + + // But since certificate counting is on, the metrics should exist on tidyStatus endpoint: + tidyStatus, err = client.Logical().Read("pki/tidy-status") + if err != nil { + t.Fatal(err) + } + // backendUUID should exist, we need this for metrics + backendUUID = tidyStatus.Data["internal_backend_uuid"].(string) + // "current_cert_store_count", "current_revoked_cert_count" + certStoreCount, ok := tidyStatus.Data["current_cert_store_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status") + } + if certStoreCount != json.Number("1") { + t.Fatalf("Only created one certificate, but a got a certificate count of %v", certStoreCount) + } + revokedCertCount, ok := tidyStatus.Data["current_revoked_cert_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status") + } + if revokedCertCount != json.Number("0") { + t.Fatalf("Have not yet revoked a certificate, but got a revoked cert store count of %v", revokedCertCount) + } + + // Write the auto-tidy config, again, this time turning on metrics + _, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{ + "enabled": true, + "interval_duration": "1s", + "tidy_cert_store": true, + "tidy_revoked_certs": true, + "safety_buffer": "1s", + "maintain_stored_certificate_counts": true, + "publish_stored_certificate_count_metrics": true, + }) + require.NoError(t, err) + + // Issue a cert and revoke it. + resp, err = client.Logical().Write("pki/issue/local-testing", map[string]interface{}{ + "common_name": "example.com", + "ttl": "10s", + }) + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.NotEmpty(t, resp.Data["serial_number"]) + require.NotEmpty(t, resp.Data["certificate"]) + leafSerial := resp.Data["serial_number"].(string) + leafCert := parseCert(t, resp.Data["certificate"].(string)) + + // Read cert before revoking + resp, err = client.Logical().Read("pki/cert/" + leafSerial) + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.NotEmpty(t, resp.Data["certificate"]) + revocationTime, err := (resp.Data["revocation_time"].(json.Number)).Int64() + require.Equal(t, int64(0), revocationTime, "revocation time was not zero") + require.Empty(t, resp.Data["revocation_time_rfc3339"], "revocation_time_rfc3339 was not empty") + require.Empty(t, resp.Data["issuer_id"], "issuer_id was not empty") + + _, err = client.Logical().Write("pki/revoke", map[string]interface{}{ + "serial_number": leafSerial, + }) + require.NoError(t, err) + + // We read the auto-tidy endpoint again, to ensure any metrics logic has completed (lock on config) + _, err = client.Logical().Read("/pki/config/auto-tidy") + if err != nil { + t.Fatal(err) + } + + // Check Metrics After Cert Has Be Created and Revoked + tidyStatus, err = client.Logical().Read("pki/tidy-status") + if err != nil { + t.Fatal(err) + } + backendUUID = tidyStatus.Data["internal_backend_uuid"].(string) + certStoreCount, ok = tidyStatus.Data["current_cert_store_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status") + } + if certStoreCount != json.Number("2") { + t.Fatalf("Created root and leaf certificate, but a got a certificate count of %v", certStoreCount) + } + revokedCertCount, ok = tidyStatus.Data["current_revoked_cert_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status") + } + if revokedCertCount != json.Number("1") { + t.Fatalf("Revoked one certificate, but got a revoked cert store count of %v", revokedCertCount) + } + // This should now be initialized + certCountError, ok := tidyStatus.Data["certificate_counting_error"] + if ok && certCountError.(string) != "" { + t.Fatalf("Expected certificate count error to disappear after initialization, but got error %v", certCountError) + } + + testhelpers.RetryUntil(t, newPeriod*5, func() error { + mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] + revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if !ok { + return errors.New("turned on metrics, but revoked cert count was not emitted") + } + if revokedCertCountGaugeValue.Value != 1 { + return fmt.Errorf("revoked one certificate, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue) + } + certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if !ok { + return errors.New("turned on metrics, but total certificate count was not emitted") + } + if certStoreCountGaugeValue.Value != 2 { + return fmt.Errorf("stored two certificiates, but total certificate count emitted was %v", certStoreCountGaugeValue.Value) + } + return nil + }) + + // Wait for cert to expire and the safety buffer to elapse. + time.Sleep(time.Until(leafCert.NotAfter) + 3*time.Second) + + // Wait for auto-tidy to run afterwards. + var foundTidyRunning string + var foundTidyFinished bool + timeoutChan := time.After(120 * time.Second) + for { + if foundTidyRunning != "" && foundTidyFinished { + break + } + + select { + case <-timeoutChan: + t.Fatalf("expected auto-tidy to run (%v) and finish (%v) before 120 seconds elapsed", foundTidyRunning, foundTidyFinished) + default: + time.Sleep(250 * time.Millisecond) + + resp, err = client.Logical().Read("pki/tidy-status") + require.NoError(t, err) + require.NotNil(t, resp) + require.NotNil(t, resp.Data) + require.NotEmpty(t, resp.Data["state"]) + require.NotEmpty(t, resp.Data["time_started"]) + state := resp.Data["state"].(string) + started := resp.Data["time_started"].(string) + t.Logf("Resp: %v", resp.Data) + + // We want the _next_ tidy run after the cert expires. This + // means if we're currently finished when we hit this the + // first time, we want to wait for the next run. + if foundTidyRunning == "" { + foundTidyRunning = started + } else if foundTidyRunning != started && !foundTidyFinished && state == "Finished" { + foundTidyFinished = true + } + } + } + + // After Tidy, Cert Store Count Should Still Be Available, and Be Updated: + // Check Metrics After Cert Has Be Created and Revoked + tidyStatus, err = client.Logical().Read("pki/tidy-status") + if err != nil { + t.Fatal(err) + } + backendUUID = tidyStatus.Data["internal_backend_uuid"].(string) + // "current_cert_store_count", "current_revoked_cert_count" + certStoreCount, ok = tidyStatus.Data["current_cert_store_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status") + } + if certStoreCount != json.Number("1") { + t.Fatalf("Created root and leaf certificate, deleted leaf, but a got a certificate count of %v", certStoreCount) + } + revokedCertCount, ok = tidyStatus.Data["current_revoked_cert_count"] + if !ok { + t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status") + } + if revokedCertCount != json.Number("0") { + t.Fatalf("Revoked certificate has been tidied, but got a revoked cert store count of %v", revokedCertCount) + } + + testhelpers.RetryUntil(t, newPeriod*5, func() error { + mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1] + revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"] + if !ok { + return errors.New("turned on metrics, but revoked cert count was not emitted") + } + if revokedCertCountGaugeValue.Value != 0 { + return fmt.Errorf("revoked certificate has been tidied, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue) + } + certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"] + if !ok { + return errors.New("turned on metrics, but total certificate count was not emitted") + } + if certStoreCountGaugeValue.Value != 1 { + return fmt.Errorf("only one of two certificates left after tidy, but total certificate count emitted was %v", certStoreCountGaugeValue.Value) + } + return nil + }) +} diff --git a/builtin/logical/pki/storage.go b/builtin/logical/pki/storage.go index f098f5835..aee1ce7f1 100644 --- a/builtin/logical/pki/storage.go +++ b/builtin/logical/pki/storage.go @@ -1351,7 +1351,33 @@ func (sc *storageContext) writeAutoTidyConfig(config *tidyConfig) error { return err } - return sc.Storage.Put(sc.Context, entry) + err = sc.Storage.Put(sc.Context, entry) + if err != nil { + return err + } + + sc.Backend.publishCertCountMetrics.Store(config.PublishMetrics) + + // To Potentially Disable Certificate Counting + if config.MaintainCount == false { + certCountWasEnabled := sc.Backend.certCountEnabled.Swap(config.MaintainCount) + if certCountWasEnabled { + sc.Backend.certsCounted.Store(true) + sc.Backend.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts" + sc.Backend.possibleDoubleCountedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize + sc.Backend.possibleDoubleCountedRevokedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize + sc.Backend.certCount.Store(0) + sc.Backend.revokedCertCount.Store(0) + } + } else { // To Potentially Enable Certificate Counting + if sc.Backend.certCountEnabled.Load() == false { + // We haven't written "re-enable certificate counts" outside the initialize function + // Any call derived call to do so is likely to time out on ~2 million certs + sc.Backend.certCountError = "Certificate Counting Has Not Been Initialized, re-initialize this mount" + } + } + + return nil } func (sc *storageContext) listRevokedCerts() ([]string, error) { diff --git a/changelog/18186.txt b/changelog/18186.txt new file mode 100644 index 000000000..137108262 --- /dev/null +++ b/changelog/18186.txt @@ -0,0 +1,6 @@ +```release-note:breaking-change +secrets/pki: Maintaining running count of certificates will be turned off by default. +To re-enable keeping these metrics available on the tidy status endpoint, enable +maintain_stored_certificate_counts on tidy-config, to also publish them to the +metrics consumer, enable publish_stored_certificate_count_metrics . +``` \ No newline at end of file