Telemetry Metrics Configuration. (#18186)

* Telemetry Metrics Configuration.

* Err Shadowing Fix (woah, semgrep is cool).

* Fix TestBackend_RevokePlusTidy_Intermediate

* Add Changelog.

* Fix memory leak.  Code cleanup as suggested by Steve.

* Turn off metrics by default, breaking-change.

* Show on tidy-status before start-up.

* Fix tests

* make fmt

* Add emit metrics to periodicFunc

* Test not delivering unavailable metrics + fix.

* Better error message.

* Fixing the false-error bug.

* make fmt.

* Try to fix race issue, remove confusing comments.

* Switch metric counter variables to an atomic.Uint32

 - Switch the metric counter variables to an atomic variable type
   so that we are forced to properly load/store values to it

* Fix race-issue better by trying until the metric is sunk.

* make fmt.

* empty commit to retrigger non-race tests that all pass locally

---------

Co-authored-by: Steve Clark <steven.clark@hashicorp.com>
This commit is contained in:
Kit Haines 2023-02-10 16:31:56 -05:00 committed by GitHub
parent 7dff0e6ae4
commit 14adb3b825
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 600 additions and 88 deletions

View File

@ -9,12 +9,12 @@ import (
"sync/atomic"
"time"
atomic2 "go.uber.org/atomic"
"github.com/hashicorp/vault/helper/constants"
"github.com/hashicorp/go-multierror"
atomic2 "go.uber.org/atomic"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/armon/go-metrics"
@ -244,9 +244,12 @@ func Backend(conf *logical.BackendConfig) *backend {
b.lastTidy = time.Now()
// Metrics initialization for count of certificates in storage
b.certCountEnabled = atomic2.NewBool(false)
b.publishCertCountMetrics = atomic2.NewBool(false)
b.certsCounted = atomic2.NewBool(false)
b.certCount = new(uint32)
b.revokedCertCount = new(uint32)
b.certCountError = "Initialize Not Yet Run, Cert Counts Unavailable"
b.certCount = &atomic.Uint32{}
b.revokedCertCount = &atomic.Uint32{}
b.possibleDoubleCountedSerials = make([]string, 0, 250)
b.possibleDoubleCountedRevokedSerials = make([]string, 0, 250)
@ -270,9 +273,12 @@ type backend struct {
unifiedTransferStatus *unifiedTransferStatus
certCount *uint32
revokedCertCount *uint32
certCountEnabled *atomic2.Bool
publishCertCountMetrics *atomic2.Bool
certCount *atomic.Uint32
revokedCertCount *atomic.Uint32
certsCounted *atomic2.Bool
certCountError string
possibleDoubleCountedSerials []string
possibleDoubleCountedRevokedSerials []string
@ -364,9 +370,10 @@ func (b *backend) initialize(ctx context.Context, _ *logical.InitializationReque
// Initialize also needs to populate our certificate and revoked certificate count
err = b.initializeStoredCertificateCounts(ctx)
if err != nil {
return err
// Don't block/err initialize/startup for metrics. Context on this call can time out due to number of certificates.
b.Logger().Error("Could not initialize stored certificate counts", err)
b.certCountError = err.Error()
}
return nil
}
@ -601,6 +608,13 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
crlErr := doCRL()
tidyErr := doAutoTidy()
// Periodically re-emit gauges so that they don't disappear/go stale
tidyConfig, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
b.emitCertStoreMetrics(tidyConfig)
var errors error
if crlErr != nil {
errors = multierror.Append(errors, fmt.Errorf("Error building CRLs:\n - %w\n", crlErr))
@ -625,24 +639,51 @@ func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) er
}
func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
// For performance reasons, we can't lock on issuance/storage of certs until a list operation completes,
// but we want to limit possible miscounts / double-counts to over-counting, so we take the tidy lock which
// prevents (most) deletions - in particular we take a read lock (sufficient to block the write lock in
// tidyStatusStart while allowing tidy to still acquire a read lock to report via its endpoint)
b.tidyStatusLock.RLock()
defer b.tidyStatusLock.RUnlock()
sc := b.makeStorageContext(ctx, b.storage)
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
b.certCountEnabled.Store(config.MaintainCount)
b.publishCertCountMetrics.Store(config.PublishMetrics)
if config.MaintainCount == false {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
b.certsCounted.Store(true)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts"
return nil
}
// Ideally these three things would be set in one transaction, since that isn't possible, set the counts to "0",
// first, so count will over-count (and miss putting things in deduplicate queue), rather than under-count.
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
// A cert issued or revoked here will be double-counted. That's okay, this is "best effort" metrics.
b.certsCounted.Store(false)
entries, err := b.storage.List(ctx, "certs/")
if err != nil {
return err
}
atomic.AddUint32(b.certCount, uint32(len(entries)))
b.certCount.Add(uint32(len(entries)))
revokedEntries, err := b.storage.List(ctx, "revoked/")
if err != nil {
return err
}
atomic.AddUint32(b.revokedCertCount, uint32(len(revokedEntries)))
b.revokedCertCount.Add(uint32(len(revokedEntries)))
b.certsCounted.Store(true)
// Now that the metrics are set, we can switch from appending newly-stored certificates to the possible double-count
@ -723,64 +764,98 @@ func (b *backend) initializeStoredCertificateCounts(ctx context.Context) error {
b.possibleDoubleCountedRevokedSerials = nil
b.possibleDoubleCountedSerials = nil
certCount := atomic.LoadUint32(b.certCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
revokedCertCount := atomic.LoadUint32(b.revokedCertCount)
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
b.emitCertStoreMetrics(config)
b.certCountError = ""
return nil
}
// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalCertificatesCount(certsCounted bool, newSerial string) {
certCount := atomic.AddUint32(b.certCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
func (b *backend) emitCertStoreMetrics(config *tidyConfig) {
if config.PublishMetrics == true {
certCount := b.certCount.Load()
b.emitTotalCertCountMetric(certCount)
revokedCertCount := b.revokedCertCount.Load()
b.emitTotalRevokedCountMetric(revokedCertCount)
}
}
func (b *backend) decrementTotalCertificatesCountReport() {
certCount := b.decrementTotalCertificatesCountNoReport()
// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) ifCountEnabledIncrementTotalCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
certCount := b.certCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "certs/") {
newSerial = newSerial[6:]
}
b.possibleDoubleCountedSerials = append(b.possibleDoubleCountedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
}
}
func (b *backend) ifCountEnabledDecrementTotalCertificatesCountReport() {
if b.certCountEnabled.Load() {
certCount := b.decrementTotalCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalCertCountMetric(certCount)
}
}
}
func (b *backend) emitTotalCertCountMetric(certCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_certificates_stored"}, float32(certCount))
}
// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalCertificatesCountNoReport() uint32 {
newCount := atomic.AddUint32(b.certCount, ^uint32(0))
newCount := b.certCount.Add(^uint32(0))
return newCount
}
// The "certsCounted" boolean here should be loaded from the backend certsCounted before the corresponding storage call:
// eg. certsCounted := b.certsCounted.Load()
func (b *backend) incrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, 1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
func (b *backend) ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted bool, newSerial string) {
if b.certCountEnabled.Load() {
newRevokedCertCount := b.revokedCertCount.Add(1)
switch {
case !certsCounted:
// This is unsafe, but a good best-attempt
if strings.HasPrefix(newSerial, "revoked/") { // allow passing in the path (revoked/serial) OR the serial
newSerial = newSerial[8:]
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(newRevokedCertCount)
}
}
b.possibleDoubleCountedRevokedSerials = append(b.possibleDoubleCountedRevokedSerials, newSerial)
default:
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(newRevokedCertCount))
}
}
func (b *backend) decrementTotalRevokedCertificatesCountReport() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
func (b *backend) ifCountEnabledDecrementTotalRevokedCertificatesCountReport() {
if b.certCountEnabled.Load() {
revokedCertCount := b.decrementTotalRevokedCertificatesCountNoReport()
if b.publishCertCountMetrics.Load() {
b.emitTotalRevokedCountMetric(revokedCertCount)
}
}
}
func (b *backend) emitTotalRevokedCountMetric(revokedCertCount uint32) {
metrics.SetGauge([]string{"secrets", "pki", b.backendUUID, "total_revoked_certificates_stored"}, float32(revokedCertCount))
}
// Called directly only by the initialize function to deduplicate the count, when we don't have a full count yet
// Does not respect whether-we-are-counting backend information.
func (b *backend) decrementTotalRevokedCertificatesCountNoReport() uint32 {
newRevokedCertCount := atomic.AddUint32(b.revokedCertCount, ^uint32(0))
newRevokedCertCount := b.revokedCertCount.Add(^uint32(0))
return newRevokedCertCount
}

View File

@ -27,7 +27,6 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"testing"
"time"
@ -3797,6 +3796,15 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
t.Fatal(err)
}
// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki/",
})
// Check the metrics initialized in order to calculate backendUUID for /pki
// BackendUUID not consistent during tests with UUID from /sys/mounts/pki
metricsSuffix := "total_certificates_stored"
@ -3833,6 +3841,14 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
if err != nil {
t.Fatal(err)
}
// Set up Metric Configuration, then restart to enable it
_, err = client.Logical().Write("pki2/config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"mounts": "pki2/",
})
// Create a CSR for the intermediate CA
secret, err := client.Logical().Write("pki2/intermediate/generate/internal", nil)
@ -3961,6 +3977,7 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
"current_revoked_cert_count": json.Number("0"),
"revocation_queue_deleted_count": json.Number("0"),
"cross_revoked_cert_deleted_count": json.Number("0"),
"internal_backend_uuid": backendUUID,
}
// Let's copy the times from the response so that we can use deep.Equal()
timeStarted, ok := tidyStatus.Data["time_started"]
@ -5590,6 +5607,14 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
serials[i] = resp.Data["serial_number"].(string)
}
// Turn on certificate counting:
CBWrite(b, s, "config/auto-tidy", map[string]interface{}{
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": false,
})
// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)
// Revoke certificates A + B
revocations := serials[0:2]
for _, key := range revocations {
@ -5601,18 +5626,16 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
}
}
// Assert initialize from clean is correct:
b.initializeStoredCertificateCounts(ctx)
if atomic.LoadUint32(b.certCount) != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", atomic.LoadUint32(b.certCount))
if b.certCount.Load() != 6 {
t.Fatalf("Failed to count six certificates root,A,B,C,D,E, instead counted %d certs", b.certCount.Load())
}
if atomic.LoadUint32(b.revokedCertCount) != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", atomic.LoadUint32(b.revokedCertCount))
if b.revokedCertCount.Load() != 2 {
t.Fatalf("Failed to count two revoked certificates A+B, instead counted %d certs", b.revokedCertCount.Load())
}
// Simulates listing while initialize in progress, by "restarting it"
atomic.StoreUint32(b.certCount, 0)
atomic.StoreUint32(b.revokedCertCount, 0)
b.certCount.Store(0)
b.revokedCertCount.Store(0)
b.certsCounted.Store(false)
// Revoke certificates C, D
@ -5641,12 +5664,12 @@ func TestBackend_InitializeCertificateCounts(t *testing.T) {
b.initializeStoredCertificateCounts(ctx)
// Test certificate count
if atomic.LoadUint32(b.certCount) != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", *(b.certCount))
if b.certCount.Load() != 8 {
t.Fatalf("Failed to initialize count of certificates root, A,B,C,D,E,F,G counted %d certs", b.certCount.Load())
}
if atomic.LoadUint32(b.revokedCertCount) != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", *(b.revokedCertCount))
if b.revokedCertCount.Load() != 4 {
t.Fatalf("Failed to count revoked certificates A,B,C,D counted %d certs", b.revokedCertCount.Load())
}
return

View File

@ -234,9 +234,9 @@ func fetchCertBySerial(sc *storageContext, prefix, serial string) (*logical.Stor
// If we fail here, we have an extra (copy) of a cert in storage, add to metrics:
switch {
case strings.HasPrefix(prefix, "revoked/"):
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, path)
default:
sc.Backend.incrementTotalCertificatesCount(certsCounted, path)
sc.Backend.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, path)
}
return nil, errutil.InternalError{Err: fmt.Sprintf("error deleting certificate with serial %s from old location", serial)}
}

View File

@ -978,7 +978,7 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) (
if err != nil {
return nil, fmt.Errorf("error saving revoked certificate to new location")
}
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
sc.Backend.ifCountEnabledIncrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
// If this flag is enabled after the fact, existing local entries will be published to
// the unified storage space through a periodic function.

View File

@ -518,6 +518,23 @@ greater period of time. By default this is zero seconds.`,
Default: "0s",
}
fields["maintain_stored_certificate_counts"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `This configures whether stored certificates
are counted upon initialization of the backend, and whether during
normal operation, a running count of certificates stored is maintained.`,
Default: false,
}
fields["publish_stored_certificate_count_metrics"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `This configures whether the stored certificate
count is published to the metrics consumer. It does not affect if the
stored certificate count is maintained, and if maintained, it will be
available on the tidy-status endpoint.`,
Default: false,
}
fields["tidy_revocation_queue"] = &framework.FieldSchema{
Type: framework.TypeBool,
Description: `Set to true to remove stale revocation queue entries

View File

@ -419,7 +419,7 @@ func (b *backend) pathIssueSignCert(ctx context.Context, req *logical.Request, d
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)
}
if useCSR {

View File

@ -271,7 +271,7 @@ func (b *backend) pathCAGenerateRoot(ctx context.Context, req *logical.Request,
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)
// Build a fresh CRL
err = b.crlBuilder.rebuild(sc, true)
@ -468,7 +468,7 @@ func (b *backend) pathIssuerSignIntermediate(ctx context.Context, req *logical.R
if err != nil {
return nil, fmt.Errorf("unable to store certificate locally: %w", err)
}
b.incrementTotalCertificatesCount(certsCounted, key)
b.ifCountEnabledIncrementTotalCertificatesCount(certsCounted, key)
if parsedBundle.Certificate.MaxPathLen == 0 {
resp.AddWarning("Max path length of the signed certificate is zero. This certificate cannot be used to issue intermediate CA certificates.")

View File

@ -70,6 +70,8 @@ type tidyConfig struct {
SafetyBuffer time.Duration `json:"safety_buffer"`
IssuerSafetyBuffer time.Duration `json:"issuer_safety_buffer"`
PauseDuration time.Duration `json:"pause_duration"`
MaintainCount bool `json:"maintain_stored_certificate_counts"`
PublishMetrics bool `json:"publish_stored_certificate_count_metrics"`
RevocationQueue bool `json:"tidy_revocation_queue"`
QueueSafetyBuffer time.Duration `json:"revocation_queue_safety_buffer"`
CrossRevokedCerts bool `json:"tidy_cross_cluster_revoked_certs"`
@ -86,6 +88,8 @@ var defaultTidyConfig = tidyConfig{
SafetyBuffer: 72 * time.Hour,
IssuerSafetyBuffer: 365 * 24 * time.Hour,
PauseDuration: 0 * time.Second,
MaintainCount: false,
PublishMetrics: false,
RevocationQueue: false,
QueueSafetyBuffer: 48 * time.Hour,
CrossRevokedCerts: false,
@ -973,11 +977,26 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
"missing_issuer_cert_count": nil,
"current_cert_store_count": nil,
"current_revoked_cert_count": nil,
"internal_backend_uuid": nil,
"revocation_queue_deleted_count": nil,
"cross_revoked_cert_deleted_count": nil,
},
}
resp.Data["internal_backend_uuid"] = b.backendUUID
if b.certCountEnabled.Load() {
resp.Data["current_cert_store_count"] = b.certCount.Load()
resp.Data["current_revoked_cert_count"] = b.revokedCertCount.Load()
if !b.certsCounted.Load() {
resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " +
"inaccurate")
}
if b.certCountError != "" {
resp.Data["certificate_counting_error"] = b.certCountError
}
}
if b.tidyStatus.state == tidyStatusInactive {
return resp, nil
}
@ -1020,14 +1039,6 @@ func (b *backend) pathTidyStatusRead(_ context.Context, _ *logical.Request, _ *f
resp.Data["time_finished"] = b.tidyStatus.timeFinished
}
resp.Data["current_cert_store_count"] = atomic.LoadUint32(b.certCount)
resp.Data["current_revoked_cert_count"] = atomic.LoadUint32(b.revokedCertCount)
if !b.certsCounted.Load() {
resp.AddWarning("Certificates in storage are still being counted, current counts provided may be " +
"inaccurate")
}
return resp, nil
}
@ -1040,19 +1051,21 @@ func (b *backend) pathConfigAutoTidyRead(ctx context.Context, req *logical.Reque
return &logical.Response{
Data: map[string]interface{}{
"enabled": config.Enabled,
"interval_duration": int(config.Interval / time.Second),
"tidy_cert_store": config.CertStore,
"tidy_revoked_certs": config.RevokedCerts,
"tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
"tidy_expired_issuers": config.ExpiredIssuers,
"tidy_move_legacy_ca_bundle": config.BackupBundle,
"safety_buffer": int(config.SafetyBuffer / time.Second),
"issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second),
"pause_duration": config.PauseDuration.String(),
"tidy_revocation_queue": config.RevocationQueue,
"revocation_queue_safety_buffer": int(config.QueueSafetyBuffer / time.Second),
"tidy_cross_cluster_revoked_certs": config.CrossRevokedCerts,
"enabled": config.Enabled,
"interval_duration": int(config.Interval / time.Second),
"tidy_cert_store": config.CertStore,
"tidy_revoked_certs": config.RevokedCerts,
"tidy_revoked_cert_issuer_associations": config.IssuerAssocs,
"tidy_expired_issuers": config.ExpiredIssuers,
"safety_buffer": int(config.SafetyBuffer / time.Second),
"issuer_safety_buffer": int(config.IssuerSafetyBuffer / time.Second),
"pause_duration": config.PauseDuration.String(),
"publish_stored_certificate_count_metrics": config.PublishMetrics,
"maintain_stored_certificate_counts": config.MaintainCount,
"tidy_move_legacy_ca_bundle": config.BackupBundle,
"tidy_revocation_queue": config.RevocationQueue,
"revocation_queue_safety_buffer": int(config.QueueSafetyBuffer / time.Second),
"tidy_cross_cluster_revoked_certs": config.CrossRevokedCerts,
},
}, nil
}
@ -1139,6 +1152,17 @@ func (b *backend) pathConfigAutoTidyWrite(ctx context.Context, req *logical.Requ
return logical.ErrorResponse("Auto-tidy enabled but no tidy operations were requested. Enable at least one tidy operation to be run (tidy_cert_store / tidy_revoked_certs / tidy_revoked_cert_issuer_associations / tidy_expired_issuers / tidy_move_legacy_ca_bundle / tidy_revocation_queue / tidy_cross_cluster_revoked_certs)."), nil
}
if maintainCountEnabledRaw, ok := d.GetOk("maintain_stored_certificate_counts"); ok {
config.MaintainCount = maintainCountEnabledRaw.(bool)
}
if runningStorageMetricsEnabledRaw, ok := d.GetOk("publish_stored_certificate_count_metrics"); ok {
if config.MaintainCount == false {
return logical.ErrorResponse("Can not publish a running storage metrics count to metrics without first maintaining that count. Enable `maintain_stored_certificate_counts` to enable `publish_stored_certificate_count_metrics."), nil
}
config.PublishMetrics = runningStorageMetricsEnabledRaw.(bool)
}
if err := sc.writeAutoTidyConfig(config); err != nil {
return nil, err
}
@ -1224,7 +1248,7 @@ func (b *backend) tidyStatusIncCertStoreCount() {
b.tidyStatus.certStoreDeletedCount++
b.decrementTotalCertificatesCountReport()
b.ifCountEnabledDecrementTotalCertificatesCountReport()
}
func (b *backend) tidyStatusIncRevokedCertCount() {
@ -1233,7 +1257,7 @@ func (b *backend) tidyStatusIncRevokedCertCount() {
b.tidyStatus.revokedCertDeletedCount++
b.decrementTotalRevokedCertificatesCountReport()
b.ifCountEnabledDecrementTotalRevokedCertificatesCountReport()
}
func (b *backend) tidyStatusIncMissingIssuerCertCount() {

View File

@ -2,9 +2,15 @@ package pki
import (
"encoding/json"
"errors"
"fmt"
"testing"
"time"
"github.com/hashicorp/vault/helper/testhelpers"
"github.com/armon/go-metrics"
"github.com/hashicorp/vault/api"
vaulthttp "github.com/hashicorp/vault/http"
"github.com/hashicorp/vault/sdk/logical"
@ -405,3 +411,338 @@ func TestTidyIssuerConfig(t *testing.T) {
require.Equal(t, true, resp.Data["tidy_expired_issuers"])
require.Equal(t, 5, resp.Data["issuer_safety_buffer"])
}
// TestCertStorageMetrics ensures that when enabled, metrics are able to count the number of certificates in storage and
// number of revoked certificates in storage. Moreover, this test ensures that the gauge is emitted periodically, so
// that the metric does not disappear or go stale.
func TestCertStorageMetrics(t *testing.T) {
// This tests uses the same setup as TestAutoTidy
newPeriod := 1 * time.Second
// We set up a metrics accumulator
inmemSink := metrics.NewInmemSink(
2*newPeriod, // A short time period is ideal here to test metrics are emitted every periodic func
2000000*time.Hour)
metricsConf := metrics.DefaultConfig("")
metricsConf.EnableHostname = false
metricsConf.EnableHostnameLabel = false
metricsConf.EnableServiceLabel = false
metricsConf.EnableTypePrefix = false
_, err := metrics.NewGlobal(metricsConf, inmemSink)
if err != nil {
t.Fatal(err)
}
// This test requires the periodicFunc to trigger, which requires we stand
// up a full test cluster.
coreConfig := &vault.CoreConfig{
LogicalBackends: map[string]logical.Factory{
"pki": Factory,
},
// See notes below about usage of /sys/raw for reading cluster
// storage without barrier encryption.
EnableRaw: true,
RollbackPeriod: newPeriod,
}
cluster := vault.NewTestCluster(t, coreConfig, &vault.TestClusterOptions{
HandlerFunc: vaulthttp.Handler,
})
cluster.Start()
defer cluster.Cleanup()
client := cluster.Cores[0].Client
// Mount PKI
err = client.Sys().Mount("pki", &api.MountInput{
Type: "pki",
Config: api.MountConfigInput{
DefaultLeaseTTL: "10m",
MaxLeaseTTL: "60m",
},
})
require.NoError(t, err)
// Generate root.
resp, err := client.Logical().Write("pki/root/generate/internal", map[string]interface{}{
"ttl": "40h",
"common_name": "Root X1",
"key_type": "ec",
})
require.NoError(t, err)
require.NotNil(t, resp)
require.NotEmpty(t, resp.Data)
require.NotEmpty(t, resp.Data["issuer_id"])
// Set up a testing role.
_, err = client.Logical().Write("pki/roles/local-testing", map[string]interface{}{
"allow_any_name": true,
"enforce_hostnames": false,
"key_type": "ec",
})
require.NoError(t, err)
// Run tidy so that tidy-status is not empty
_, err = client.Logical().Write("pki/tidy", map[string]interface{}{
"tidy_revoked_certs": true,
})
require.NoError(t, err)
// Since certificate counts are off by default, we shouldn't see counts in the tidy status
tidyStatus, err := client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
// backendUUID should exist, we need this for metrics
backendUUID := tidyStatus.Data["internal_backend_uuid"].(string)
// "current_cert_store_count", "current_revoked_cert_count"
countData, ok := tidyStatus.Data["current_cert_store_count"]
if ok && countData != nil {
t.Fatalf("Certificate counting should be off by default, but current cert store count %v appeared in tidy status in unconfigured mount", countData)
}
revokedCountData, ok := tidyStatus.Data["current_revoked_cert_count"]
if ok && revokedCountData != nil {
t.Fatalf("Certificate counting should be off by default, but revoked cert count %v appeared in tidy status in unconfigured mount", revokedCountData)
}
// Since certificate counts are off by default, those metrics should not exist yet
mostRecentInterval := inmemSink.Data()[len(inmemSink.Data())-1]
_, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"]
if ok {
t.Fatalf("Certificate counting should be off by default, but revoked cert count was emitted as a metric in an unconfigured mount")
}
_, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"]
if ok {
t.Fatalf("Certificate counting should be off by default, but total certificate count was emitted as a metric in an unconfigured mount")
}
// Write the auto-tidy config.
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"enabled": true,
"interval_duration": "1s",
"tidy_cert_store": true,
"tidy_revoked_certs": true,
"safety_buffer": "1s",
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": false,
})
require.NoError(t, err)
// Reload the Mount - Otherwise Stored Certificate Counts Will Not Be Populated
_, err = client.Logical().Write("/sys/plugins/reload/backend", map[string]interface{}{
"plugin": "pki",
})
// By reading the auto-tidy endpoint, we ensure that initialize has completed (which has a write lock on auto-tidy)
_, err = client.Logical().Read("/pki/config/auto-tidy")
if err != nil {
t.Fatal(err)
}
// Since publish_stored_certificate_count_metrics is still false, these metrics should still not exist yet
mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1]
_, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"]
if ok {
t.Fatalf("Certificate counting should be off by default, but revoked cert count was emitted as a metric in an unconfigured mount")
}
_, ok = mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"]
if ok {
t.Fatalf("Certificate counting should be off by default, but total certificate count was emitted as a metric in an unconfigured mount")
}
// But since certificate counting is on, the metrics should exist on tidyStatus endpoint:
tidyStatus, err = client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
// backendUUID should exist, we need this for metrics
backendUUID = tidyStatus.Data["internal_backend_uuid"].(string)
// "current_cert_store_count", "current_revoked_cert_count"
certStoreCount, ok := tidyStatus.Data["current_cert_store_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status")
}
if certStoreCount != json.Number("1") {
t.Fatalf("Only created one certificate, but a got a certificate count of %v", certStoreCount)
}
revokedCertCount, ok := tidyStatus.Data["current_revoked_cert_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status")
}
if revokedCertCount != json.Number("0") {
t.Fatalf("Have not yet revoked a certificate, but got a revoked cert store count of %v", revokedCertCount)
}
// Write the auto-tidy config, again, this time turning on metrics
_, err = client.Logical().Write("pki/config/auto-tidy", map[string]interface{}{
"enabled": true,
"interval_duration": "1s",
"tidy_cert_store": true,
"tidy_revoked_certs": true,
"safety_buffer": "1s",
"maintain_stored_certificate_counts": true,
"publish_stored_certificate_count_metrics": true,
})
require.NoError(t, err)
// Issue a cert and revoke it.
resp, err = client.Logical().Write("pki/issue/local-testing", map[string]interface{}{
"common_name": "example.com",
"ttl": "10s",
})
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.Data)
require.NotEmpty(t, resp.Data["serial_number"])
require.NotEmpty(t, resp.Data["certificate"])
leafSerial := resp.Data["serial_number"].(string)
leafCert := parseCert(t, resp.Data["certificate"].(string))
// Read cert before revoking
resp, err = client.Logical().Read("pki/cert/" + leafSerial)
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.Data)
require.NotEmpty(t, resp.Data["certificate"])
revocationTime, err := (resp.Data["revocation_time"].(json.Number)).Int64()
require.Equal(t, int64(0), revocationTime, "revocation time was not zero")
require.Empty(t, resp.Data["revocation_time_rfc3339"], "revocation_time_rfc3339 was not empty")
require.Empty(t, resp.Data["issuer_id"], "issuer_id was not empty")
_, err = client.Logical().Write("pki/revoke", map[string]interface{}{
"serial_number": leafSerial,
})
require.NoError(t, err)
// We read the auto-tidy endpoint again, to ensure any metrics logic has completed (lock on config)
_, err = client.Logical().Read("/pki/config/auto-tidy")
if err != nil {
t.Fatal(err)
}
// Check Metrics After Cert Has Be Created and Revoked
tidyStatus, err = client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
backendUUID = tidyStatus.Data["internal_backend_uuid"].(string)
certStoreCount, ok = tidyStatus.Data["current_cert_store_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status")
}
if certStoreCount != json.Number("2") {
t.Fatalf("Created root and leaf certificate, but a got a certificate count of %v", certStoreCount)
}
revokedCertCount, ok = tidyStatus.Data["current_revoked_cert_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status")
}
if revokedCertCount != json.Number("1") {
t.Fatalf("Revoked one certificate, but got a revoked cert store count of %v", revokedCertCount)
}
// This should now be initialized
certCountError, ok := tidyStatus.Data["certificate_counting_error"]
if ok && certCountError.(string) != "" {
t.Fatalf("Expected certificate count error to disappear after initialization, but got error %v", certCountError)
}
testhelpers.RetryUntil(t, newPeriod*5, func() error {
mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1]
revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"]
if !ok {
return errors.New("turned on metrics, but revoked cert count was not emitted")
}
if revokedCertCountGaugeValue.Value != 1 {
return fmt.Errorf("revoked one certificate, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue)
}
certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"]
if !ok {
return errors.New("turned on metrics, but total certificate count was not emitted")
}
if certStoreCountGaugeValue.Value != 2 {
return fmt.Errorf("stored two certificiates, but total certificate count emitted was %v", certStoreCountGaugeValue.Value)
}
return nil
})
// Wait for cert to expire and the safety buffer to elapse.
time.Sleep(time.Until(leafCert.NotAfter) + 3*time.Second)
// Wait for auto-tidy to run afterwards.
var foundTidyRunning string
var foundTidyFinished bool
timeoutChan := time.After(120 * time.Second)
for {
if foundTidyRunning != "" && foundTidyFinished {
break
}
select {
case <-timeoutChan:
t.Fatalf("expected auto-tidy to run (%v) and finish (%v) before 120 seconds elapsed", foundTidyRunning, foundTidyFinished)
default:
time.Sleep(250 * time.Millisecond)
resp, err = client.Logical().Read("pki/tidy-status")
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.Data)
require.NotEmpty(t, resp.Data["state"])
require.NotEmpty(t, resp.Data["time_started"])
state := resp.Data["state"].(string)
started := resp.Data["time_started"].(string)
t.Logf("Resp: %v", resp.Data)
// We want the _next_ tidy run after the cert expires. This
// means if we're currently finished when we hit this the
// first time, we want to wait for the next run.
if foundTidyRunning == "" {
foundTidyRunning = started
} else if foundTidyRunning != started && !foundTidyFinished && state == "Finished" {
foundTidyFinished = true
}
}
}
// After Tidy, Cert Store Count Should Still Be Available, and Be Updated:
// Check Metrics After Cert Has Be Created and Revoked
tidyStatus, err = client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
backendUUID = tidyStatus.Data["internal_backend_uuid"].(string)
// "current_cert_store_count", "current_revoked_cert_count"
certStoreCount, ok = tidyStatus.Data["current_cert_store_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but current cert store count does not appear in tidy status")
}
if certStoreCount != json.Number("1") {
t.Fatalf("Created root and leaf certificate, deleted leaf, but a got a certificate count of %v", certStoreCount)
}
revokedCertCount, ok = tidyStatus.Data["current_revoked_cert_count"]
if !ok {
t.Fatalf("Certificate counting has been turned on, but revoked cert store count does not appear in tidy status")
}
if revokedCertCount != json.Number("0") {
t.Fatalf("Revoked certificate has been tidied, but got a revoked cert store count of %v", revokedCertCount)
}
testhelpers.RetryUntil(t, newPeriod*5, func() error {
mostRecentInterval = inmemSink.Data()[len(inmemSink.Data())-1]
revokedCertCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_revoked_certificates_stored"]
if !ok {
return errors.New("turned on metrics, but revoked cert count was not emitted")
}
if revokedCertCountGaugeValue.Value != 0 {
return fmt.Errorf("revoked certificate has been tidied, but metrics emitted a revoked cert store count of %v", revokedCertCountGaugeValue)
}
certStoreCountGaugeValue, ok := mostRecentInterval.Gauges["secrets.pki."+backendUUID+".total_certificates_stored"]
if !ok {
return errors.New("turned on metrics, but total certificate count was not emitted")
}
if certStoreCountGaugeValue.Value != 1 {
return fmt.Errorf("only one of two certificates left after tidy, but total certificate count emitted was %v", certStoreCountGaugeValue.Value)
}
return nil
})
}

View File

@ -1351,7 +1351,33 @@ func (sc *storageContext) writeAutoTidyConfig(config *tidyConfig) error {
return err
}
return sc.Storage.Put(sc.Context, entry)
err = sc.Storage.Put(sc.Context, entry)
if err != nil {
return err
}
sc.Backend.publishCertCountMetrics.Store(config.PublishMetrics)
// To Potentially Disable Certificate Counting
if config.MaintainCount == false {
certCountWasEnabled := sc.Backend.certCountEnabled.Swap(config.MaintainCount)
if certCountWasEnabled {
sc.Backend.certsCounted.Store(true)
sc.Backend.certCountError = "Cert Count is Disabled: enable via Tidy Config maintain_stored_certificate_counts"
sc.Backend.possibleDoubleCountedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize
sc.Backend.possibleDoubleCountedRevokedSerials = nil // This won't stop a list operation, but will stop an expensive clean-up during initialize
sc.Backend.certCount.Store(0)
sc.Backend.revokedCertCount.Store(0)
}
} else { // To Potentially Enable Certificate Counting
if sc.Backend.certCountEnabled.Load() == false {
// We haven't written "re-enable certificate counts" outside the initialize function
// Any call derived call to do so is likely to time out on ~2 million certs
sc.Backend.certCountError = "Certificate Counting Has Not Been Initialized, re-initialize this mount"
}
}
return nil
}
func (sc *storageContext) listRevokedCerts() ([]string, error) {

6
changelog/18186.txt Normal file
View File

@ -0,0 +1,6 @@
```release-note:breaking-change
secrets/pki: Maintaining running count of certificates will be turned off by default.
To re-enable keeping these metrics available on the tidy status endpoint, enable
maintain_stored_certificate_counts on tidy-config, to also publish them to the
metrics consumer, enable publish_stored_certificate_count_metrics .
```