VAULT-444: Add PKI tidy-status endpoint. (#12885)

VAULT-444: Add PKI tidy-status endpoint.

Add metrics so that the PKI tidy status can be monitored using telemetry as well.

Co-authored-by: Steven Clark <steven.clark@hashicorp.com>
This commit is contained in:
Victor Rodriguez 2021-11-02 11:12:49 -04:00 committed by GitHub
parent d37da52974
commit f6e35369f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 435 additions and 36 deletions

View File

@ -3,11 +3,11 @@ package pki
import ( import (
"context" "context"
"strings" "strings"
"sync"
"time" "time"
"github.com/hashicorp/vault/sdk/framework" "github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/logical" "github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/vault"
) )
// Factory creates a new backend implementing the logical.Backend interface // Factory creates a new backend implementing the logical.Backend interface
@ -75,6 +75,7 @@ func Backend(conf *logical.BackendConfig) *backend {
pathFetchListCerts(&b), pathFetchListCerts(&b),
pathRevoke(&b), pathRevoke(&b),
pathTidy(&b), pathTidy(&b),
pathTidyStatus(&b),
}, },
Secrets: []*framework.Secret{ Secrets: []*framework.Secret{
@ -86,6 +87,7 @@ func Backend(conf *logical.BackendConfig) *backend {
b.crlLifetime = time.Hour * 72 b.crlLifetime = time.Hour * 72
b.tidyCASGuard = new(uint32) b.tidyCASGuard = new(uint32)
b.tidyStatus = &tidyStatus{state: tidyStatusInactive}
b.storage = conf.StorageView b.storage = conf.StorageView
return &b return &b
@ -96,8 +98,36 @@ type backend struct {
storage logical.Storage storage logical.Storage
crlLifetime time.Duration crlLifetime time.Duration
revokeStorageLock sync.RWMutex revokeStorageLock vault.DeadlockRWMutex
tidyCASGuard *uint32 tidyCASGuard *uint32
tidyStatusLock vault.DeadlockRWMutex
tidyStatus *tidyStatus
}
// tidyStatusState identifies the phase of the tidy operation tracked by the
// backend; pathTidyStatusRead translates it into the "state" response field.
type tidyStatusState int
const (
// tidyStatusInactive is the state assigned when the backend is created,
// before any tidy operation has been started. Reported as "Inactive".
tidyStatusInactive tidyStatusState = iota
// tidyStatusStarted is set by tidyStatusStart when a tidy operation begins.
// Reported as "Running".
tidyStatusStarted
// tidyStatusFinished is set by tidyStatusStop when it is given a nil error.
// Reported as "Finished".
tidyStatusFinished
// tidyStatusError is set by tidyStatusStop when it is given a non-nil
// error. Reported as "Error".
tidyStatusError
)
// tidyStatus holds the parameters and progress of the current (or most recent)
// tidy operation. It is read by pathTidyStatusRead and mutated by the
// tidyStatus* helper methods on backend, which all take tidyStatusLock.
type tidyStatus struct {
// Parameters used to initiate the operation
// safetyBuffer is the "safety_buffer" request value passed to tidyStatusStart.
safetyBuffer int
// tidyCertStore is the "tidy_cert_store" request value.
tidyCertStore bool
// tidyRevokedCerts is true when either revoked-cert tidying or revocation
// list tidying was requested (the caller ORs the two flags together).
tidyRevokedCerts bool
// Status
// state is the current phase of the operation.
state tidyStatusState
// err is the error handed to tidyStatusStop; nil when the run succeeded.
err error
// timeStarted is captured when tidyStatusStart runs.
timeStarted time.Time
// timeFinished is captured when tidyStatusStop runs.
timeFinished time.Time
// message is the latest progress text recorded via tidyStatusMessage.
message string
// certStoreDeletedCount counts calls to tidyStatusIncCertStoreCount.
certStoreDeletedCount uint
// revokedCertDeletedCount counts calls to tidyStatusIncRevokedCertCount.
revokedCertDeletedCount uint
} }
const backendHelp = ` const backendHelp = `

View File

@ -12,6 +12,7 @@ import (
"crypto/x509" "crypto/x509"
"crypto/x509/pkix" "crypto/x509/pkix"
"encoding/base64" "encoding/base64"
"encoding/json"
"encoding/pem" "encoding/pem"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
@ -29,6 +30,7 @@ import (
"testing" "testing"
"time" "time"
"github.com/armon/go-metrics"
"github.com/fatih/structs" "github.com/fatih/structs"
"github.com/go-test/deep" "github.com/go-test/deep"
"github.com/hashicorp/go-secure-stdlib/strutil" "github.com/hashicorp/go-secure-stdlib/strutil"
@ -3092,6 +3094,22 @@ func setCerts() {
} }
func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) { func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
// Use a ridiculously long time to minimize the chance
// that we have to deal with more than one interval.
// InMemSink rounds down to an interval boundary rather than
// starting one at the time of initialization.
inmemSink := metrics.NewInmemSink(
1000000*time.Hour,
2000000*time.Hour)
metricsConf := metrics.DefaultConfig("")
metricsConf.EnableHostname = false
metricsConf.EnableHostnameLabel = false
metricsConf.EnableServiceLabel = false
metricsConf.EnableTypePrefix = false
metrics.NewGlobal(metricsConf, inmemSink)
// Enable PKI secret engine // Enable PKI secret engine
coreConfig := &vault.CoreConfig{ coreConfig := &vault.CoreConfig{
LogicalBackends: map[string]logical.Factory{ LogicalBackends: map[string]logical.Factory{
@ -3243,6 +3261,91 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
// Sleep a bit to make sure we're past the safety buffer // Sleep a bit to make sure we're past the safety buffer
time.Sleep(2 * time.Second) time.Sleep(2 * time.Second)
// Issue a tidy-status on /pki
{
tidyStatus, err := client.Logical().Read("pki/tidy-status")
if err != nil {
t.Fatal(err)
}
expectedData := map[string]interface{}{
"safety_buffer": json.Number("1"),
"tidy_cert_store": true,
"tidy_revoked_certs": true,
"state": "Finished",
"error": nil,
"time_started": nil,
"time_finished": nil,
"message": nil,
"cert_store_deleted_count": json.Number("1"),
"revoked_cert_deleted_count": json.Number("1"),
}
// Let's copy the times from the response so that we can use deep.Equal()
timeStarted, ok := tidyStatus.Data["time_started"]
if !ok || timeStarted == "" {
t.Fatal("Expected tidy status response to include a value for time_started")
}
expectedData["time_started"] = timeStarted
timeFinished, ok := tidyStatus.Data["time_finished"]
if !ok || timeFinished == "" {
t.Fatal("Expected tidy status response to include a value for time_finished")
}
expectedData["time_finished"] = timeFinished
if diff := deep.Equal(expectedData, tidyStatus.Data); diff != nil {
t.Fatal(diff)
}
}
// Check the tidy metrics
{
// Map of gauges to expected value
expectedGauges := map[string]float32{
"secrets.pki.tidy.cert_store_current_entry": 0,
"secrets.pki.tidy.cert_store_total_entries": 1,
"secrets.pki.tidy.revoked_cert_current_entry": 0,
"secrets.pki.tidy.revoked_cert_total_entries": 1,
"secrets.pki.tidy.start_time_epoch": 0,
}
// Map of counters to the sum of the metrics for that counter
expectedCounters := map[string]float64{
"secrets.pki.tidy.cert_store_deleted_count": 1,
"secrets.pki.tidy.revoked_cert_deleted_count": 1,
"secrets.pki.tidy.success": 2,
// Note that "secrets.pki.tidy.failure" won't be in the captured metrics
}
// If the metrics span more than one interval, skip the checks
intervals := inmemSink.Data()
if len(intervals) == 1 {
interval := inmemSink.Data()[0]
for gauge, value := range expectedGauges {
if _, ok := interval.Gauges[gauge]; !ok {
t.Fatalf("Expected metrics to include a value for gauge %s", gauge)
}
if value != interval.Gauges[gauge].Value {
t.Fatalf("Expected value metric %s to be %f but got %f", gauge, value, interval.Gauges[gauge].Value)
}
}
for counter, value := range expectedCounters {
if _, ok := interval.Counters[counter]; !ok {
t.Fatalf("Expected metrics to include a value for couter %s", counter)
}
if value != interval.Counters[counter].Sum {
t.Fatalf("Expected the sum of metric %s to be %f but got %f", counter, value, interval.Counters[counter].Sum)
}
}
tidyDuration, ok := interval.Samples["secrets.pki.tidy.duration"]
if !ok {
t.Fatal("Expected metrics to include a value for sample secrets.pki.tidy.duration")
}
if tidyDuration.Count <= 0 {
t.Fatalf("Expected metrics to have count > 0 for sample secrets.pki.tidy.duration, but got %d", tidyDuration.Count)
}
}
}
req = client.NewRequest("GET", "/v1/pki/crl") req = client.NewRequest("GET", "/v1/pki/crl")
resp, err = client.RawRequest(req) resp, err = client.RawRequest(req)
if err != nil { if err != nil {

View File

@ -558,6 +558,32 @@ func runSteps(t *testing.T, rootB, intB *backend, client *api.Client, rootName,
} }
} }
verifyTidyStatus := func(expectedCertStoreDeleteCount int, expectedRevokedCertDeletedCount int) {
tidyStatus, err := client.Logical().Read(rootName+"tidy-status")
if err != nil {
t.Fatal(err)
}
if tidyStatus.Data["state"] != "Finished" {
t.Fatalf("Expected tidy operation to be finished, but tidy-status reports its state is %v", tidyStatus.Data)
}
var count int64
if count, err = tidyStatus.Data["cert_store_deleted_count"].(json.Number).Int64(); err != nil {
t.Fatal(err)
}
if int64(expectedCertStoreDeleteCount) != count {
t.Fatalf("Expected %d for cert_store_deleted_count, but got %d", expectedCertStoreDeleteCount, count)
}
if count, err = tidyStatus.Data["revoked_cert_deleted_count"].(json.Number).Int64(); err != nil {
t.Fatal(err)
}
if int64(expectedRevokedCertDeletedCount) != count {
t.Fatalf("Expected %d for revoked_cert_deleted_count, but got %d", expectedRevokedCertDeletedCount, count)
}
}
// Validate current state of revoked certificates // Validate current state of revoked certificates
verifyRevocation(t, intSerialNumber, true) verifyRevocation(t, intSerialNumber, true)
@ -585,6 +611,8 @@ func runSteps(t *testing.T, rootB, intB *backend, client *api.Client, rootName,
// Check to make sure we still find the cert and see it on the CRL // Check to make sure we still find the cert and see it on the CRL
verifyRevocation(t, intSerialNumber, true) verifyRevocation(t, intSerialNumber, true)
verifyTidyStatus(0, 0)
} }
// Run with both values set false, nothing should happen // Run with both values set false, nothing should happen
@ -606,6 +634,8 @@ func runSteps(t *testing.T, rootB, intB *backend, client *api.Client, rootName,
// Check to make sure we still find the cert and see it on the CRL // Check to make sure we still find the cert and see it on the CRL
verifyRevocation(t, intSerialNumber, true) verifyRevocation(t, intSerialNumber, true)
verifyTidyStatus(0, 0)
} }
// Run with a short safety buffer and both set to true, both should be cleared // Run with a short safety buffer and both set to true, both should be cleared
@ -627,6 +657,9 @@ func runSteps(t *testing.T, rootB, intB *backend, client *api.Client, rootName,
// Check to make sure we still find the cert and see it on the CRL // Check to make sure we still find the cert and see it on the CRL
verifyRevocation(t, intSerialNumber, false) verifyRevocation(t, intSerialNumber, false)
verifyTidyStatus(1, 1)
} }
} }
} }

View File

@ -8,6 +8,7 @@ import (
"sync/atomic" "sync/atomic"
"time" "time"
"github.com/armon/go-metrics"
"github.com/hashicorp/vault/sdk/framework" "github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/logical" "github.com/hashicorp/vault/sdk/logical"
@ -15,7 +16,7 @@ import (
func pathTidy(b *backend) *framework.Path { func pathTidy(b *backend) *framework.Path {
return &framework.Path{ return &framework.Path{
Pattern: "tidy", Pattern: "tidy$",
Fields: map[string]*framework.FieldSchema{ Fields: map[string]*framework.FieldSchema{
"tidy_cert_store": { "tidy_cert_store": {
Type: framework.TypeBool, Type: framework.TypeBool,
@ -45,8 +46,11 @@ Defaults to 72 hours.`,
}, },
}, },
Callbacks: map[logical.Operation]framework.OperationFunc{ Operations: map[logical.Operation]framework.OperationHandler{
logical.UpdateOperation: b.pathTidyWrite, logical.UpdateOperation: &framework.PathOperation{
Callback: b.pathTidyWrite,
ForwardPerformanceStandby: true,
},
}, },
HelpSynopsis: pathTidyHelpSyn, HelpSynopsis: pathTidyHelpSyn,
@ -54,12 +58,21 @@ Defaults to 72 hours.`,
} }
} }
func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) { func pathTidyStatus(b *backend) *framework.Path {
// If we are a performance standby forward the request to the active node return &framework.Path{
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) { Pattern: "tidy-status$",
return nil, logical.ErrReadOnly Operations: map[logical.Operation]framework.OperationHandler{
logical.ReadOperation: &framework.PathOperation{
Callback: b.pathTidyStatusRead,
ForwardPerformanceStandby: true,
},
},
HelpSynopsis: pathTidyStatusHelpSyn,
HelpDescription: pathTidyStatusHelpDesc,
} }
}
func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
safetyBuffer := d.Get("safety_buffer").(int) safetyBuffer := d.Get("safety_buffer").(int)
tidyCertStore := d.Get("tidy_cert_store").(bool) tidyCertStore := d.Get("tidy_cert_store").(bool)
tidyRevokedCerts := d.Get("tidy_revoked_certs").(bool) tidyRevokedCerts := d.Get("tidy_revoked_certs").(bool)
@ -86,6 +99,8 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
go func() { go func() {
defer atomic.StoreUint32(b.tidyCASGuard, 0) defer atomic.StoreUint32(b.tidyCASGuard, 0)
b.tidyStatusStart(safetyBuffer, tidyCertStore, tidyRevokedCerts || tidyRevocationList)
// Don't cancel when the original client request goes away // Don't cancel when the original client request goes away
ctx = context.Background() ctx = context.Background()
@ -98,7 +113,12 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
return fmt.Errorf("error fetching list of certs: %w", err) return fmt.Errorf("error fetching list of certs: %w", err)
} }
for _, serial := range serials { serialCount := len(serials)
metrics.SetGauge([]string{"secrets", "pki", "tidy", "cert_store_total_entries"}, float32(serialCount))
for i, serial := range serials {
b.tidyStatusMessage(fmt.Sprintf("Tidying certificate store: checking entry %d of %d", i, serialCount))
metrics.SetGauge([]string{"secrets", "pki", "tidy", "cert_store_current_entry"}, float32(i))
certEntry, err := req.Storage.Get(ctx, "certs/"+serial) certEntry, err := req.Storage.Get(ctx, "certs/"+serial)
if err != nil { if err != nil {
return fmt.Errorf("error fetching certificate %q: %w", serial, err) return fmt.Errorf("error fetching certificate %q: %w", serial, err)
@ -109,6 +129,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil { if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil {
return fmt.Errorf("error deleting nil entry with serial %s: %w", serial, err) return fmt.Errorf("error deleting nil entry with serial %s: %w", serial, err)
} }
b.tidyStatusIncCertStoreCount()
continue continue
} }
@ -117,6 +138,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil { if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil {
return fmt.Errorf("error deleting entry with nil value with serial %s: %w", serial, err) return fmt.Errorf("error deleting entry with nil value with serial %s: %w", serial, err)
} }
b.tidyStatusIncCertStoreCount()
continue continue
} }
@ -129,6 +151,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil { if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil {
return fmt.Errorf("error deleting serial %q from storage: %w", serial, err) return fmt.Errorf("error deleting serial %q from storage: %w", serial, err)
} }
b.tidyStatusIncCertStoreCount()
} }
} }
} }
@ -144,8 +167,14 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
return fmt.Errorf("error fetching list of revoked certs: %w", err) return fmt.Errorf("error fetching list of revoked certs: %w", err)
} }
revokedSerialsCount := len(revokedSerials)
metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_total_entries"}, float32(revokedSerialsCount))
var revInfo revocationInfo var revInfo revocationInfo
for _, serial := range revokedSerials { for i, serial := range revokedSerials {
b.tidyStatusMessage(fmt.Sprintf("Tidying revoked certificates: checking certificate %d of %d", i, len(revokedSerials)))
metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_current_entry"}, float32(i))
revokedEntry, err := req.Storage.Get(ctx, "revoked/"+serial) revokedEntry, err := req.Storage.Get(ctx, "revoked/"+serial)
if err != nil { if err != nil {
return fmt.Errorf("unable to fetch revoked cert with serial %q: %w", serial, err) return fmt.Errorf("unable to fetch revoked cert with serial %q: %w", serial, err)
@ -156,6 +185,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
if err := req.Storage.Delete(ctx, "revoked/"+serial); err != nil { if err := req.Storage.Delete(ctx, "revoked/"+serial); err != nil {
return fmt.Errorf("error deleting nil revoked entry with serial %s: %w", serial, err) return fmt.Errorf("error deleting nil revoked entry with serial %s: %w", serial, err)
} }
b.tidyStatusIncRevokedCertCount()
continue continue
} }
@ -164,6 +194,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
if err := req.Storage.Delete(ctx, "revoked/"+serial); err != nil { if err := req.Storage.Delete(ctx, "revoked/"+serial); err != nil {
return fmt.Errorf("error deleting revoked entry with nil value with serial %s: %w", serial, err) return fmt.Errorf("error deleting revoked entry with nil value with serial %s: %w", serial, err)
} }
b.tidyStatusIncRevokedCertCount()
continue continue
} }
@ -189,6 +220,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
return fmt.Errorf("error deleting serial %q from store when tidying revoked: %w", serial, err) return fmt.Errorf("error deleting serial %q from store when tidying revoked: %w", serial, err)
} }
rebuildCRL = true rebuildCRL = true
b.tidyStatusIncRevokedCertCount()
} }
} }
@ -204,7 +236,9 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
if err := doTidy(); err != nil { if err := doTidy(); err != nil {
logger.Error("error running tidy", "error", err) logger.Error("error running tidy", "error", err)
return b.tidyStatusStop(err)
} else {
b.tidyStatusStop(nil)
} }
}() }()
@ -213,6 +247,121 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
return logical.RespondWithStatusCode(resp, req, http.StatusAccepted) return logical.RespondWithStatusCode(resp, req, http.StatusAccepted)
} }
// pathTidyStatusRead handles reads of the tidy-status endpoint. It reports the
// parameters, state, timestamps, progress message and deletion counters of the
// current or most recent tidy operation. Before any tidy has been started,
// every field is nil except "state", which is "Inactive".
func (b *backend) pathTidyStatusRead(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
	// On a performance secondary, returning ErrReadOnly causes the request to
	// be forwarded; a local mount is answered here instead.
	sys := b.System()
	if sys.ReplicationState().HasState(consts.ReplicationPerformanceSecondary) && !sys.LocalMount() {
		return nil, logical.ErrReadOnly
	}

	b.tidyStatusLock.RLock()
	defer b.tidyStatusLock.RUnlock()

	status := b.tidyStatus

	// Start from the "nothing has run yet" answer and fill it in below.
	data := map[string]interface{}{
		"safety_buffer":              nil,
		"tidy_cert_store":            nil,
		"tidy_revoked_certs":         nil,
		"state":                      "Inactive",
		"error":                      nil,
		"time_started":               nil,
		"time_finished":              nil,
		"message":                    nil,
		"cert_store_deleted_count":   nil,
		"revoked_cert_deleted_count": nil,
	}
	resp := &logical.Response{Data: data}

	if status.state == tidyStatusInactive {
		return resp, nil
	}

	// Fields common to every non-inactive state.
	data["safety_buffer"] = status.safetyBuffer
	data["tidy_cert_store"] = status.tidyCertStore
	data["tidy_revoked_certs"] = status.tidyRevokedCerts
	data["time_started"] = status.timeStarted
	data["message"] = status.message
	data["cert_store_deleted_count"] = status.certStoreDeletedCount
	data["revoked_cert_deleted_count"] = status.revokedCertDeletedCount

	switch status.state {
	case tidyStatusStarted:
		data["state"] = "Running"
	case tidyStatusFinished:
		data["state"] = "Finished"
		data["time_finished"] = status.timeFinished
		// A completed run has no progress left to report.
		data["message"] = nil
	case tidyStatusError:
		data["state"] = "Error"
		data["time_finished"] = status.timeFinished
		data["error"] = status.err.Error()
		// Don't clear the message so that it serves as a hint about when
		// the error occurred.
	}

	return resp, nil
}
// tidyStatusStart replaces the tracked tidy status with a fresh record for a
// newly started operation, remembering the parameters it was launched with,
// and publishes the start time through the start_time_epoch gauge.
func (b *backend) tidyStatusStart(safetyBuffer int, tidyCertStore, tidyRevokedCerts bool) {
	b.tidyStatusLock.Lock()
	defer b.tidyStatusLock.Unlock()

	startedAt := time.Now()
	b.tidyStatus = &tidyStatus{
		state:            tidyStatusStarted,
		timeStarted:      startedAt,
		safetyBuffer:     safetyBuffer,
		tidyCertStore:    tidyCertStore,
		tidyRevokedCerts: tidyRevokedCerts,
	}

	// NOTE(review): the gauge value is a float32, which cannot represent
	// present-day Unix timestamps to one-second precision.
	gaugeKey := []string{"secrets", "pki", "tidy", "start_time_epoch"}
	metrics.SetGauge(gaugeKey, float32(startedAt.Unix()))
}
// tidyStatusStop marks the tracked tidy operation as finished. A nil err
// records a successful run and a non-nil err records a failed one. It then
// emits the duration, the deletion counters and a success/failure counter,
// and resets the start_time_epoch gauge to 0.
func (b *backend) tidyStatusStop(err error) {
	b.tidyStatusLock.Lock()
	defer b.tidyStatusLock.Unlock()

	// Decide the final state and the outcome counter name in one place.
	finalState, outcome := tidyStatusFinished, "success"
	if err != nil {
		finalState, outcome = tidyStatusError, "failure"
	}

	status := b.tidyStatus
	status.timeFinished = time.Now()
	status.err = err
	status.state = finalState

	metrics.MeasureSince([]string{"secrets", "pki", "tidy", "duration"}, status.timeStarted)
	metrics.SetGauge([]string{"secrets", "pki", "tidy", "start_time_epoch"}, 0)
	metrics.IncrCounter([]string{"secrets", "pki", "tidy", "cert_store_deleted_count"}, float32(status.certStoreDeletedCount))
	metrics.IncrCounter([]string{"secrets", "pki", "tidy", "revoked_cert_deleted_count"}, float32(status.revokedCertDeletedCount))
	metrics.IncrCounter([]string{"secrets", "pki", "tidy", outcome}, 1)
}
// tidyStatusMessage stores msg as the progress message of the tracked tidy
// operation, replacing any previous message.
func (b *backend) tidyStatusMessage(msg string) {
	b.tidyStatusLock.Lock()
	defer b.tidyStatusLock.Unlock()

	status := b.tidyStatus
	status.message = msg
}
// tidyStatusIncCertStoreCount adds one to the number of certificate store
// entries deleted by the tracked tidy operation.
func (b *backend) tidyStatusIncCertStoreCount() {
	b.tidyStatusLock.Lock()
	defer b.tidyStatusLock.Unlock()

	status := b.tidyStatus
	status.certStoreDeletedCount += 1
}
// tidyStatusIncRevokedCertCount adds one to the number of revoked certificate
// entries deleted by the tracked tidy operation.
func (b *backend) tidyStatusIncRevokedCertCount() {
	b.tidyStatusLock.Lock()
	defer b.tidyStatusLock.Unlock()

	status := b.tidyStatus
	status.revokedCertDeletedCount += 1
}
const pathTidyHelpSyn = ` const pathTidyHelpSyn = `
Tidy up the backend by removing expired certificates, revocation information, Tidy up the backend by removing expired certificates, revocation information,
or both. or both.
@ -239,3 +388,25 @@ certificate storage or in revocation information will then be checked. If the
current time, minus the value of 'safety_buffer', is greater than the current time, minus the value of 'safety_buffer', is greater than the
expiration, it will be removed. expiration, it will be removed.
` `
const pathTidyStatusHelpSyn = `
Returns the status of the tidy operation.
`
const pathTidyStatusHelpDesc = `
This is a read only endpoint that returns information about the current tidy
operation, or the most recent if none is currently running.
The result includes the following fields:
* 'safety_buffer': the value of this parameter when initiating the tidy operation
* 'tidy_cert_store': the value of this parameter when initiating the tidy operation
* 'tidy_revoked_certs': the value of this parameter when initiating the tidy operation
* 'state': one of "Inactive", "Running", "Finished", "Error"
* 'error': the error message, if the operation ran into an error
* 'time_started': the time the operation started
* 'time_finished': the time the operation finished
* 'message': One of "Tidying certificate store: checking entry N of TOTAL" or
"Tidying revoked certificates: checking certificate N of TOTAL"
* 'cert_store_deleted_count': The number of certificate storage entries deleted
* 'revoked_cert_deleted_count': The number of revoked certificate entries deleted
`

3
changelog/12885.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:feature:
secrets/pki: Add `tidy-status` endpoint to obtain information about the current or most recent tidy operation.
```

View File

@ -1606,6 +1606,55 @@ $ curl \
http://127.0.0.1:8200/v1/pki/tidy http://127.0.0.1:8200/v1/pki/tidy
``` ```
## Tidy Status
This is a read only endpoint that returns information about the current tidy
operation, or the most recent if none are currently running.
The result includes the following fields:
* `safety_buffer`: the value of this parameter when initiating the tidy operation
* `tidy_cert_store`: the value of this parameter when initiating the tidy operation
* `tidy_revoked_certs`: the value of this parameter when initiating the tidy operation
* `state`: one of *Inactive*, *Running*, *Finished*, *Error*
* `error`: the error message, if the operation ran into an error
* `time_started`: the time the operation started
* `time_finished`: the time the operation finished
* `message`: One of *Tidying certificate store: checking entry N of TOTAL* or
*Tidying revoked certificates: checking certificate N of TOTAL*
* `cert_store_deleted_count`: The number of certificate storage entries deleted
* `revoked_cert_deleted_count`: The number of revoked certificate entries deleted
| Method | Path |
| :----- | :----------------- |
| `GET` | `/pki/tidy-status` |
### Sample Request
```shell-session
$ curl \
--header "X-Vault-Token: ..." \
--request GET \
http://127.0.0.1:8200/v1/pki/tidy-status
```
### Sample Response
```json
"data": {
"safety_buffer": 60,
"tidy_cert_store": true,
"tidy_revoked_certs": true,
"error": null,
"message": "Tidying certificate store: checking entry 234 of 488",
"revoked_cert_deleted_count": 0,
"cert_store_deleted_count": 2,
"state": "Running",
"time_started": "2021-10-20T14:52:13.510161-04:00",
"time_finished": null
},
```
# Cluster Scalability # Cluster Scalability
Most non-introspection operations in the PKI secrets engine require a write to Most non-introspection operations in the PKI secrets engine require a write to

View File

@ -288,30 +288,40 @@ These metrics relate to [Vault Enterprise Replication](/docs/enterprise/replicat
These metrics relate to the supported [secrets engines][secrets-engines]. These metrics relate to the supported [secrets engines][secrets-engines].
| Metric | Description | Unit | Type | | Metric | Description | Unit | Type |
| :------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :------ | | :------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :------ |
| `database.Initialize` | Time taken to initialize a database secret engine across all database secrets engines | ms | summary | | `database.Initialize` | Time taken to initialize a database secret engine across all database secrets engines | ms | summary |
| `database.<name>.Initialize` | Time taken to initialize a database secret engine for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Initialize` | ms | summary | | `database.<name>.Initialize` | Time taken to initialize a database secret engine for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Initialize` | ms | summary |
| `database.Initialize.error` | Number of database secrets engine initialization operation errors across all database secrets engines | errors | counter | | `database.Initialize.error` | Number of database secrets engine initialization operation errors across all database secrets engines | errors | counter |
| `database.<name>.Initialize.error` | Number of database secrets engine initialization operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Initialize.error` | errors | counter | | `database.<name>.Initialize.error` | Number of database secrets engine initialization operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Initialize.error` | errors | counter |
| `database.Close` | Time taken to close a database secret engine across all database secrets engines | ms | summary | | `database.Close` | Time taken to close a database secret engine across all database secrets engines | ms | summary |
| `database.<name>.Close` | Time taken to close a database secret engine for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Close` | ms | summary | | `database.<name>.Close` | Time taken to close a database secret engine for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Close` | ms | summary |
| `database.Close.error` | Number of database secrets engine close operation errors across all database secrets engines | errors | counter | | `database.Close.error` | Number of database secrets engine close operation errors across all database secrets engines | errors | counter |
| `database.<name>.Close.error` | Number of database secrets engine close operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Close.error` | errors | counter | | `database.<name>.Close.error` | Number of database secrets engine close operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Close.error` | errors | counter |
| `database.CreateUser` | Time taken to create a user across all database secrets engines | ms | summary | | `database.CreateUser` | Time taken to create a user across all database secrets engines | ms | summary |
| `database.<name>.CreateUser` | Time taken to create a user for the named database secrets engine `<name>` | ms | summary | | `database.<name>.CreateUser` | Time taken to create a user for the named database secrets engine `<name>` | ms | summary |
| `database.CreateUser.error` | Number of user creation operation errors across all database secrets engines | errors | counter | | `database.CreateUser.error` | Number of user creation operation errors across all database secrets engines | errors | counter |
| `database.<name>.CreateUser.error` | Number of user creation operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.CreateUser.error` | errors | counter | | `database.<name>.CreateUser.error` | Number of user creation operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.CreateUser.error` | errors | counter |
| `database.RenewUser` | Time taken to renew a user across all database secrets engines | ms | summary | | `database.RenewUser` | Time taken to renew a user across all database secrets engines | ms | summary |
| `database.<name>.RenewUser` | Time taken to renew a user for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RenewUser` | ms | summary | | `database.<name>.RenewUser` | Time taken to renew a user for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RenewUser` | ms | summary |
| `database.RenewUser.error` | Number of user renewal operation errors across all database secrets engines | errors | counter | | `database.RenewUser.error` | Number of user renewal operation errors across all database secrets engines | errors | counter |
| `database.<name>.RenewUser.error` | Number of user renewal operations for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RenewUser.error` | errors | counter | | `database.<name>.RenewUser.error` | Number of user renewal operations for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RenewUser.error` | errors | counter |
| `database.RevokeUser` | Time taken to revoke a user across all database secrets engines | ms | summary | | `database.RevokeUser` | Time taken to revoke a user across all database secrets engines | ms | summary |
| `database.<name>.RevokeUser` | Time taken to revoke a user for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RevokeUser` | ms | summary | | `database.<name>.RevokeUser` | Time taken to revoke a user for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RevokeUser` | ms | summary |
| `database.RevokeUser.error` | Number of user revocation operation errors across all database secrets engines | errors | counter | | `database.RevokeUser.error` | Number of user revocation operation errors across all database secrets engines | errors | counter |
| `database.<name>.RevokeUser.error` | Number of user revocation operations for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RevokeUser.error` | errors | counter | | `database.<name>.RevokeUser.error` | Number of user revocation operations for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RevokeUser.error` | errors | counter |
| `vault.secret.kv.count` (cluster, namespace, mount_point) | Number of entries in each key-value secret engine. | paths | gauge | | `secrets.pki.tidy.cert_store_current_entry` | The index of the current entry in the certificate store being verified by the tidy operation | entry index | gauge |
| `vault.secret.lease.creation` (cluster, namespace, secret_engine, mount_point, creation_ttl) | Counts the number of leases created by secret engines. | leases | counter | | `secrets.pki.tidy.cert_store_deleted_count` | Number of entries deleted from the certificate store | entry | counter |
| `secrets.pki.tidy.cert_store_total_entries` | Number of entries in the certificate store to verify during the tidy operation | entry | gauge |
| `secrets.pki.tidy.duration` | Duration of time taken by the PKI tidy operation | ms | summary |
| `secrets.pki.tidy.failure` | Number of times the PKI tidy operation has not completed due to errors | operations | counter |
| `secrets.pki.tidy.revoked_cert_current_entry` | The index of the current revoked certificate entry in the certificate store being verified by the tidy operation | entry index | gauge |
| `secrets.pki.tidy.revoked_cert_deleted_count` | Number of entries deleted from the certificate store for revoked certificates | entry | counter |
| `secrets.pki.tidy.revoked_cert_total_entries` | Number of entries in the certificate store for revoked certificates to verify during the tidy operation | entry | gauge |
| `secrets.pki.tidy.start_time_epoch` | Start time (as seconds since Jan 1 1970) when the PKI tidy operation is active, 0 otherwise | seconds | gauge |
| `secrets.pki.tidy.success` | Number of times the PKI tidy operation has completed successfully | operations | counter |
| `vault.secret.kv.count` (cluster, namespace, mount_point) | Number of entries in each key-value secret engine. | paths | gauge |
| `vault.secret.lease.creation` (cluster, namespace, secret_engine, mount_point, creation_ttl) | Counts the number of leases created by secret engines. | leases | counter |
## Storage Backend Metrics ## Storage Backend Metrics