VAULT-444: Add PKI tidy-status endpoint. (#12885)
VAULT-444: Add PKI tidy-status endpoint. Add metrics so that the PKI tidy status can be monitored using telemetry as well. Co-authored-by: Steven Clark <steven.clark@hashicorp.com>
This commit is contained in:
parent
d37da52974
commit
f6e35369f0
|
@ -3,11 +3,11 @@ package pki
|
|||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/vault/sdk/framework"
|
||||
"github.com/hashicorp/vault/sdk/logical"
|
||||
"github.com/hashicorp/vault/vault"
|
||||
)
|
||||
|
||||
// Factory creates a new backend implementing the logical.Backend interface
|
||||
|
@ -75,6 +75,7 @@ func Backend(conf *logical.BackendConfig) *backend {
|
|||
pathFetchListCerts(&b),
|
||||
pathRevoke(&b),
|
||||
pathTidy(&b),
|
||||
pathTidyStatus(&b),
|
||||
},
|
||||
|
||||
Secrets: []*framework.Secret{
|
||||
|
@ -86,6 +87,7 @@ func Backend(conf *logical.BackendConfig) *backend {
|
|||
|
||||
b.crlLifetime = time.Hour * 72
|
||||
b.tidyCASGuard = new(uint32)
|
||||
b.tidyStatus = &tidyStatus{state: tidyStatusInactive}
|
||||
b.storage = conf.StorageView
|
||||
|
||||
return &b
|
||||
|
@ -96,8 +98,36 @@ type backend struct {
|
|||
|
||||
storage logical.Storage
|
||||
crlLifetime time.Duration
|
||||
revokeStorageLock sync.RWMutex
|
||||
revokeStorageLock vault.DeadlockRWMutex
|
||||
tidyCASGuard *uint32
|
||||
|
||||
tidyStatusLock vault.DeadlockRWMutex
|
||||
tidyStatus *tidyStatus
|
||||
}
|
||||
|
||||
// tidyStatusState identifies where a tidy operation is in its lifecycle.
// It is stored in tidyStatus.state.
type tidyStatusState int

const (
	// tidyStatusInactive is the zero value: no tidy operation has been
	// recorded yet.
	tidyStatusInactive tidyStatusState = iota

	// tidyStatusStarted marks a tidy operation that is in progress.
	tidyStatusStarted

	// tidyStatusFinished marks a tidy operation that completed without error.
	tidyStatusFinished

	// tidyStatusError marks a tidy operation that stopped with an error.
	tidyStatusError
)
|
||||
|
||||
type tidyStatus struct {
|
||||
// Parameters used to initiate the operation
|
||||
safetyBuffer int
|
||||
tidyCertStore bool
|
||||
tidyRevokedCerts bool
|
||||
|
||||
// Status
|
||||
state tidyStatusState
|
||||
err error
|
||||
timeStarted time.Time
|
||||
timeFinished time.Time
|
||||
message string
|
||||
certStoreDeletedCount uint
|
||||
revokedCertDeletedCount uint
|
||||
}
|
||||
|
||||
const backendHelp = `
|
||||
|
|
|
@ -12,6 +12,7 @@ import (
|
|||
"crypto/x509"
|
||||
"crypto/x509/pkix"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"encoding/pem"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
|
@ -29,6 +30,7 @@ import (
|
|||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
"github.com/fatih/structs"
|
||||
"github.com/go-test/deep"
|
||||
"github.com/hashicorp/go-secure-stdlib/strutil"
|
||||
|
@ -3092,6 +3094,22 @@ func setCerts() {
|
|||
}
|
||||
|
||||
func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
|
||||
// Use a ridiculously long time to minimize the chance
|
||||
// that we have to deal with more than one interval.
|
||||
// InMemSink rounds down to an interval boundary rather than
|
||||
// starting one at the time of initialization.
|
||||
inmemSink := metrics.NewInmemSink(
|
||||
1000000*time.Hour,
|
||||
2000000*time.Hour)
|
||||
|
||||
metricsConf := metrics.DefaultConfig("")
|
||||
metricsConf.EnableHostname = false
|
||||
metricsConf.EnableHostnameLabel = false
|
||||
metricsConf.EnableServiceLabel = false
|
||||
metricsConf.EnableTypePrefix = false
|
||||
|
||||
metrics.NewGlobal(metricsConf, inmemSink)
|
||||
|
||||
// Enable PKI secret engine
|
||||
coreConfig := &vault.CoreConfig{
|
||||
LogicalBackends: map[string]logical.Factory{
|
||||
|
@ -3243,6 +3261,91 @@ func TestBackend_RevokePlusTidy_Intermediate(t *testing.T) {
|
|||
// Sleep a bit to make sure we're past the safety buffer
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
// Issue a tidy-status on /pki
|
||||
{
|
||||
tidyStatus, err := client.Logical().Read("pki/tidy-status")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
expectedData := map[string]interface{}{
|
||||
"safety_buffer": json.Number("1"),
|
||||
"tidy_cert_store": true,
|
||||
"tidy_revoked_certs": true,
|
||||
"state": "Finished",
|
||||
"error": nil,
|
||||
"time_started": nil,
|
||||
"time_finished": nil,
|
||||
"message": nil,
|
||||
"cert_store_deleted_count": json.Number("1"),
|
||||
"revoked_cert_deleted_count": json.Number("1"),
|
||||
}
|
||||
// Let's copy the times from the response so that we can use deep.Equal()
|
||||
timeStarted, ok := tidyStatus.Data["time_started"]
|
||||
if !ok || timeStarted == "" {
|
||||
t.Fatal("Expected tidy status response to include a value for time_started")
|
||||
}
|
||||
expectedData["time_started"] = timeStarted
|
||||
timeFinished, ok := tidyStatus.Data["time_finished"]
|
||||
if !ok || timeFinished == "" {
|
||||
t.Fatal("Expected tidy status response to include a value for time_finished")
|
||||
}
|
||||
expectedData["time_finished"] = timeFinished
|
||||
|
||||
if diff := deep.Equal(expectedData, tidyStatus.Data); diff != nil {
|
||||
t.Fatal(diff)
|
||||
}
|
||||
}
|
||||
// Check the tidy metrics
|
||||
{
|
||||
// Map of gauges to expected value
|
||||
expectedGauges := map[string]float32{
|
||||
"secrets.pki.tidy.cert_store_current_entry": 0,
|
||||
"secrets.pki.tidy.cert_store_total_entries": 1,
|
||||
"secrets.pki.tidy.revoked_cert_current_entry": 0,
|
||||
"secrets.pki.tidy.revoked_cert_total_entries": 1,
|
||||
"secrets.pki.tidy.start_time_epoch": 0,
|
||||
}
|
||||
// Map of counters to the sum of the metrics for that counter
|
||||
expectedCounters := map[string]float64{
|
||||
"secrets.pki.tidy.cert_store_deleted_count": 1,
|
||||
"secrets.pki.tidy.revoked_cert_deleted_count": 1,
|
||||
"secrets.pki.tidy.success": 2,
|
||||
// Note that "secrets.pki.tidy.failure" won't be in the captured metrics
|
||||
}
|
||||
|
||||
// If the metrics span more than one interval, skip the checks
|
||||
intervals := inmemSink.Data()
|
||||
if len(intervals) == 1 {
|
||||
interval := inmemSink.Data()[0]
|
||||
|
||||
for gauge, value := range expectedGauges {
|
||||
if _, ok := interval.Gauges[gauge]; !ok {
|
||||
t.Fatalf("Expected metrics to include a value for gauge %s", gauge)
|
||||
}
|
||||
if value != interval.Gauges[gauge].Value {
|
||||
t.Fatalf("Expected value metric %s to be %f but got %f", gauge, value, interval.Gauges[gauge].Value)
|
||||
}
|
||||
|
||||
}
|
||||
for counter, value := range expectedCounters {
|
||||
if _, ok := interval.Counters[counter]; !ok {
|
||||
t.Fatalf("Expected metrics to include a value for couter %s", counter)
|
||||
}
|
||||
if value != interval.Counters[counter].Sum {
|
||||
t.Fatalf("Expected the sum of metric %s to be %f but got %f", counter, value, interval.Counters[counter].Sum)
|
||||
}
|
||||
}
|
||||
|
||||
tidyDuration, ok := interval.Samples["secrets.pki.tidy.duration"]
|
||||
if !ok {
|
||||
t.Fatal("Expected metrics to include a value for sample secrets.pki.tidy.duration")
|
||||
}
|
||||
if tidyDuration.Count <= 0 {
|
||||
t.Fatalf("Expected metrics to have count > 0 for sample secrets.pki.tidy.duration, but got %d", tidyDuration.Count)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
req = client.NewRequest("GET", "/v1/pki/crl")
|
||||
resp, err = client.RawRequest(req)
|
||||
if err != nil {
|
||||
|
|
|
@ -558,6 +558,32 @@ func runSteps(t *testing.T, rootB, intB *backend, client *api.Client, rootName,
|
|||
}
|
||||
}
|
||||
|
||||
verifyTidyStatus := func(expectedCertStoreDeleteCount int, expectedRevokedCertDeletedCount int) {
|
||||
tidyStatus, err := client.Logical().Read(rootName+"tidy-status")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if tidyStatus.Data["state"] != "Finished" {
|
||||
t.Fatalf("Expected tidy operation to be finished, but tidy-status reports its state is %v", tidyStatus.Data)
|
||||
}
|
||||
|
||||
var count int64
|
||||
if count, err = tidyStatus.Data["cert_store_deleted_count"].(json.Number).Int64(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if int64(expectedCertStoreDeleteCount) != count {
|
||||
t.Fatalf("Expected %d for cert_store_deleted_count, but got %d", expectedCertStoreDeleteCount, count)
|
||||
}
|
||||
|
||||
if count, err = tidyStatus.Data["revoked_cert_deleted_count"].(json.Number).Int64(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if int64(expectedRevokedCertDeletedCount) != count {
|
||||
t.Fatalf("Expected %d for revoked_cert_deleted_count, but got %d", expectedRevokedCertDeletedCount, count)
|
||||
}
|
||||
}
|
||||
|
||||
// Validate current state of revoked certificates
|
||||
verifyRevocation(t, intSerialNumber, true)
|
||||
|
||||
|
@ -585,6 +611,8 @@ func runSteps(t *testing.T, rootB, intB *backend, client *api.Client, rootName,
|
|||
|
||||
// Check to make sure we still find the cert and see it on the CRL
|
||||
verifyRevocation(t, intSerialNumber, true)
|
||||
|
||||
verifyTidyStatus(0, 0)
|
||||
}
|
||||
|
||||
// Run with both values set false, nothing should happen
|
||||
|
@ -606,6 +634,8 @@ func runSteps(t *testing.T, rootB, intB *backend, client *api.Client, rootName,
|
|||
|
||||
// Check to make sure we still find the cert and see it on the CRL
|
||||
verifyRevocation(t, intSerialNumber, true)
|
||||
|
||||
verifyTidyStatus(0, 0)
|
||||
}
|
||||
|
||||
// Run with a short safety buffer and both set to true, both should be cleared
|
||||
|
@ -627,6 +657,9 @@ func runSteps(t *testing.T, rootB, intB *backend, client *api.Client, rootName,
|
|||
|
||||
// Check to make sure the cert is no longer found and is no longer on the CRL
|
||||
verifyRevocation(t, intSerialNumber, false)
|
||||
|
||||
verifyTidyStatus(1, 1)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@ import (
|
|||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
"github.com/hashicorp/vault/sdk/framework"
|
||||
"github.com/hashicorp/vault/sdk/helper/consts"
|
||||
"github.com/hashicorp/vault/sdk/logical"
|
||||
|
@ -15,7 +16,7 @@ import (
|
|||
|
||||
func pathTidy(b *backend) *framework.Path {
|
||||
return &framework.Path{
|
||||
Pattern: "tidy",
|
||||
Pattern: "tidy$",
|
||||
Fields: map[string]*framework.FieldSchema{
|
||||
"tidy_cert_store": {
|
||||
Type: framework.TypeBool,
|
||||
|
@ -45,8 +46,11 @@ Defaults to 72 hours.`,
|
|||
},
|
||||
},
|
||||
|
||||
Callbacks: map[logical.Operation]framework.OperationFunc{
|
||||
logical.UpdateOperation: b.pathTidyWrite,
|
||||
Operations: map[logical.Operation]framework.OperationHandler{
|
||||
logical.UpdateOperation: &framework.PathOperation{
|
||||
Callback: b.pathTidyWrite,
|
||||
ForwardPerformanceStandby: true,
|
||||
},
|
||||
},
|
||||
|
||||
HelpSynopsis: pathTidyHelpSyn,
|
||||
|
@ -54,12 +58,21 @@ Defaults to 72 hours.`,
|
|||
}
|
||||
}
|
||||
|
||||
func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
|
||||
// If we are a performance standby forward the request to the active node
|
||||
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) {
|
||||
return nil, logical.ErrReadOnly
|
||||
func pathTidyStatus(b *backend) *framework.Path {
|
||||
return &framework.Path{
|
||||
Pattern: "tidy-status$",
|
||||
Operations: map[logical.Operation]framework.OperationHandler{
|
||||
logical.ReadOperation: &framework.PathOperation{
|
||||
Callback: b.pathTidyStatusRead,
|
||||
ForwardPerformanceStandby: true,
|
||||
},
|
||||
},
|
||||
HelpSynopsis: pathTidyStatusHelpSyn,
|
||||
HelpDescription: pathTidyStatusHelpDesc,
|
||||
}
|
||||
}
|
||||
|
||||
func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
|
||||
safetyBuffer := d.Get("safety_buffer").(int)
|
||||
tidyCertStore := d.Get("tidy_cert_store").(bool)
|
||||
tidyRevokedCerts := d.Get("tidy_revoked_certs").(bool)
|
||||
|
@ -86,6 +99,8 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
go func() {
|
||||
defer atomic.StoreUint32(b.tidyCASGuard, 0)
|
||||
|
||||
b.tidyStatusStart(safetyBuffer, tidyCertStore, tidyRevokedCerts || tidyRevocationList)
|
||||
|
||||
// Don't cancel when the original client request goes away
|
||||
ctx = context.Background()
|
||||
|
||||
|
@ -98,7 +113,12 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
return fmt.Errorf("error fetching list of certs: %w", err)
|
||||
}
|
||||
|
||||
for _, serial := range serials {
|
||||
serialCount := len(serials)
|
||||
metrics.SetGauge([]string{"secrets", "pki", "tidy", "cert_store_total_entries"}, float32(serialCount))
|
||||
for i, serial := range serials {
|
||||
b.tidyStatusMessage(fmt.Sprintf("Tidying certificate store: checking entry %d of %d", i, serialCount))
|
||||
metrics.SetGauge([]string{"secrets", "pki", "tidy", "cert_store_current_entry"}, float32(i))
|
||||
|
||||
certEntry, err := req.Storage.Get(ctx, "certs/"+serial)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error fetching certificate %q: %w", serial, err)
|
||||
|
@ -109,6 +129,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil {
|
||||
return fmt.Errorf("error deleting nil entry with serial %s: %w", serial, err)
|
||||
}
|
||||
b.tidyStatusIncCertStoreCount()
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -117,6 +138,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil {
|
||||
return fmt.Errorf("error deleting entry with nil value with serial %s: %w", serial, err)
|
||||
}
|
||||
b.tidyStatusIncCertStoreCount()
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -129,6 +151,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
if err := req.Storage.Delete(ctx, "certs/"+serial); err != nil {
|
||||
return fmt.Errorf("error deleting serial %q from storage: %w", serial, err)
|
||||
}
|
||||
b.tidyStatusIncCertStoreCount()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -144,8 +167,14 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
return fmt.Errorf("error fetching list of revoked certs: %w", err)
|
||||
}
|
||||
|
||||
revokedSerialsCount := len(revokedSerials)
|
||||
metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_total_entries"}, float32(revokedSerialsCount))
|
||||
|
||||
var revInfo revocationInfo
|
||||
for _, serial := range revokedSerials {
|
||||
for i, serial := range revokedSerials {
|
||||
b.tidyStatusMessage(fmt.Sprintf("Tidying revoked certificates: checking certificate %d of %d", i, len(revokedSerials)))
|
||||
metrics.SetGauge([]string{"secrets", "pki", "tidy", "revoked_cert_current_entry"}, float32(i))
|
||||
|
||||
revokedEntry, err := req.Storage.Get(ctx, "revoked/"+serial)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to fetch revoked cert with serial %q: %w", serial, err)
|
||||
|
@ -156,6 +185,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
if err := req.Storage.Delete(ctx, "revoked/"+serial); err != nil {
|
||||
return fmt.Errorf("error deleting nil revoked entry with serial %s: %w", serial, err)
|
||||
}
|
||||
b.tidyStatusIncRevokedCertCount()
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -164,6 +194,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
if err := req.Storage.Delete(ctx, "revoked/"+serial); err != nil {
|
||||
return fmt.Errorf("error deleting revoked entry with nil value with serial %s: %w", serial, err)
|
||||
}
|
||||
b.tidyStatusIncRevokedCertCount()
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -189,6 +220,7 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
return fmt.Errorf("error deleting serial %q from store when tidying revoked: %w", serial, err)
|
||||
}
|
||||
rebuildCRL = true
|
||||
b.tidyStatusIncRevokedCertCount()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -204,7 +236,9 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
|
||||
if err := doTidy(); err != nil {
|
||||
logger.Error("error running tidy", "error", err)
|
||||
return
|
||||
b.tidyStatusStop(err)
|
||||
} else {
|
||||
b.tidyStatusStop(nil)
|
||||
}
|
||||
}()
|
||||
|
||||
|
@ -213,6 +247,121 @@ func (b *backend) pathTidyWrite(ctx context.Context, req *logical.Request, d *fr
|
|||
return logical.RespondWithStatusCode(resp, req, http.StatusAccepted)
|
||||
}
|
||||
|
||||
func (b *backend) pathTidyStatusRead(ctx context.Context, req *logical.Request, d *framework.FieldData) (*logical.Response, error) {
|
||||
// If this node is a performance secondary return an ErrReadOnly so that the request gets forwarded,
|
||||
// but only if the PKI backend is not a local mount.
|
||||
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary) && !b.System().LocalMount() {
|
||||
return nil, logical.ErrReadOnly
|
||||
}
|
||||
|
||||
b.tidyStatusLock.RLock()
|
||||
defer b.tidyStatusLock.RUnlock()
|
||||
|
||||
resp := &logical.Response{
|
||||
Data: map[string]interface{}{
|
||||
"safety_buffer": nil,
|
||||
"tidy_cert_store": nil,
|
||||
"tidy_revoked_certs": nil,
|
||||
"state": "Inactive",
|
||||
"error": nil,
|
||||
"time_started": nil,
|
||||
"time_finished": nil,
|
||||
"message": nil,
|
||||
"cert_store_deleted_count": nil,
|
||||
"revoked_cert_deleted_count": nil,
|
||||
},
|
||||
}
|
||||
|
||||
if b.tidyStatus.state == tidyStatusInactive {
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
resp.Data["safety_buffer"] = b.tidyStatus.safetyBuffer
|
||||
resp.Data["tidy_cert_store"] = b.tidyStatus.tidyCertStore
|
||||
resp.Data["tidy_revoked_certs"] = b.tidyStatus.tidyRevokedCerts
|
||||
resp.Data["time_started"] = b.tidyStatus.timeStarted
|
||||
resp.Data["message"] = b.tidyStatus.message
|
||||
resp.Data["cert_store_deleted_count"] = b.tidyStatus.certStoreDeletedCount
|
||||
resp.Data["revoked_cert_deleted_count"] = b.tidyStatus.revokedCertDeletedCount
|
||||
|
||||
switch(b.tidyStatus.state) {
|
||||
case tidyStatusStarted:
|
||||
resp.Data["state"] = "Running"
|
||||
case tidyStatusFinished:
|
||||
resp.Data["state"] = "Finished"
|
||||
resp.Data["time_finished"] = b.tidyStatus.timeFinished
|
||||
resp.Data["message"] = nil
|
||||
case tidyStatusError:
|
||||
resp.Data["state"] = "Error"
|
||||
resp.Data["time_finished"] = b.tidyStatus.timeFinished
|
||||
resp.Data["error"] = b.tidyStatus.err.Error()
|
||||
// Don't clear the message so that it serves as a hint about when
|
||||
// the error ocurred.
|
||||
}
|
||||
|
||||
return resp, nil
|
||||
}
|
||||
|
||||
func (b *backend) tidyStatusStart(safetyBuffer int, tidyCertStore, tidyRevokedCerts bool) {
|
||||
b.tidyStatusLock.Lock()
|
||||
defer b.tidyStatusLock.Unlock()
|
||||
|
||||
b.tidyStatus = &tidyStatus{
|
||||
safetyBuffer: safetyBuffer,
|
||||
tidyCertStore: tidyCertStore,
|
||||
tidyRevokedCerts: tidyRevokedCerts,
|
||||
state: tidyStatusStarted,
|
||||
timeStarted: time.Now(),
|
||||
}
|
||||
|
||||
metrics.SetGauge([]string{"secrets", "pki", "tidy", "start_time_epoch"}, float32(b.tidyStatus.timeStarted.Unix()))
|
||||
}
|
||||
|
||||
func (b *backend) tidyStatusStop(err error) {
|
||||
b.tidyStatusLock.Lock()
|
||||
defer b.tidyStatusLock.Unlock()
|
||||
|
||||
b.tidyStatus.timeFinished = time.Now()
|
||||
b.tidyStatus.err = err
|
||||
if err == nil {
|
||||
b.tidyStatus.state = tidyStatusFinished
|
||||
} else {
|
||||
b.tidyStatus.state = tidyStatusError
|
||||
}
|
||||
|
||||
metrics.MeasureSince([]string{"secrets", "pki", "tidy", "duration"}, b.tidyStatus.timeStarted)
|
||||
metrics.SetGauge([]string{"secrets", "pki", "tidy", "start_time_epoch"}, 0)
|
||||
metrics.IncrCounter([]string{"secrets", "pki", "tidy", "cert_store_deleted_count"}, float32(b.tidyStatus.certStoreDeletedCount))
|
||||
metrics.IncrCounter([]string{"secrets", "pki", "tidy", "revoked_cert_deleted_count"}, float32(b.tidyStatus.revokedCertDeletedCount))
|
||||
|
||||
if err != nil {
|
||||
metrics.IncrCounter([]string{"secrets", "pki", "tidy", "failure"}, 1)
|
||||
} else {
|
||||
metrics.IncrCounter([]string{"secrets", "pki", "tidy", "success"}, 1)
|
||||
}
|
||||
}
|
||||
|
||||
func (b *backend) tidyStatusMessage(msg string) {
|
||||
b.tidyStatusLock.Lock()
|
||||
defer b.tidyStatusLock.Unlock()
|
||||
|
||||
b.tidyStatus.message = msg
|
||||
}
|
||||
|
||||
func (b *backend) tidyStatusIncCertStoreCount() {
|
||||
b.tidyStatusLock.Lock()
|
||||
defer b.tidyStatusLock.Unlock()
|
||||
|
||||
b.tidyStatus.certStoreDeletedCount++
|
||||
}
|
||||
|
||||
func (b *backend) tidyStatusIncRevokedCertCount() {
|
||||
b.tidyStatusLock.Lock()
|
||||
defer b.tidyStatusLock.Unlock()
|
||||
|
||||
b.tidyStatus.revokedCertDeletedCount++
|
||||
}
|
||||
|
||||
const pathTidyHelpSyn = `
|
||||
Tidy up the backend by removing expired certificates, revocation information,
|
||||
or both.
|
||||
|
@ -239,3 +388,25 @@ certificate storage or in revocation information will then be checked. If the
|
|||
current time, minus the value of 'safety_buffer', is greater than the
|
||||
expiration, it will be removed.
|
||||
`
|
||||
|
||||
const pathTidyStatusHelpSyn = `
|
||||
Returns the status of the tidy operation.
|
||||
`
|
||||
|
||||
const pathTidyStatusHelpDesc = `
|
||||
This is a read only endpoint that returns information about the current tidy
|
||||
operation, or the most recent if none is currently running.
|
||||
|
||||
The result includes the following fields:
|
||||
* 'safety_buffer': the value of this parameter when initiating the tidy operation
|
||||
* 'tidy_cert_store': the value of this parameter when initiating the tidy operation
|
||||
* 'tidy_revoked_certs': the value of this parameter when initiating the tidy operation
|
||||
* 'state': one of "Inactive", "Running", "Finished", "Error"
|
||||
* 'error': the error message, if the operation ran into an error
|
||||
* 'time_started': the time the operation started
|
||||
* 'time_finished': the time the operation finished
|
||||
* 'message': One of "Tidying certificate store: checking entry N of TOTAL" or
|
||||
"Tidying revoked certificates: checking certificate N of TOTAL"
|
||||
* 'cert_store_deleted_count': The number of certificate storage entries deleted
|
||||
* 'revoked_cert_deleted_count': The number of revoked certificate entries deleted
|
||||
`
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
```release-note:feature:
|
||||
secrets/pki: Add `tidy-status` endpoint to obtain information of the current or most recent tidy operation.
|
||||
```
|
|
@ -1606,6 +1606,55 @@ $ curl \
|
|||
http://127.0.0.1:8200/v1/pki/tidy
|
||||
```
|
||||
|
||||
## Tidy Status
|
||||
|
||||
This is a read only endpoint that returns information about the current tidy
|
||||
operation, or the most recent if none are currently running.
|
||||
|
||||
The result includes the following fields:
|
||||
* `safety_buffer`: the value of this parameter when initiating the tidy operation
|
||||
* `tidy_cert_store`: the value of this parameter when initiating the tidy operation
|
||||
* `tidy_revoked_certs`: the value of this parameter when initiating the tidy operation
|
||||
* `state`: one of *Inactive*, *Running*, *Finished*, *Error*
|
||||
* `error`: the error message, if the operation ran into an error
|
||||
* `time_started`: the time the operation started
|
||||
* `time_finished`: the time the operation finished
|
||||
* `message`: One of *Tidying certificate store: checking entry N of TOTAL* or
|
||||
*Tidying revoked certificates: checking certificate N of TOTAL*
|
||||
* `cert_store_deleted_count`: The number of certificate storage entries deleted
|
||||
* `revoked_cert_deleted_count`: The number of revoked certificate entries deleted
|
||||
|
||||
| Method | Path |
|
||||
| :----- | :----------------- |
|
||||
| `GET` | `/pki/tidy-status` |
|
||||
|
||||
### Sample Request
|
||||
|
||||
```shell-session
|
||||
$ curl \
|
||||
--header "X-Vault-Token: ..." \
|
||||
--request GET \
|
||||
http://127.0.0.1:8200/v1/pki/tidy-status
|
||||
|
||||
```
|
||||
|
||||
### Sample Response
|
||||
|
||||
```json
|
||||
"data": {
|
||||
"safety_buffer": 60,
|
||||
"tidy_cert_store": true,
|
||||
"tidy_revoked_certs": true,
|
||||
"error": null,
|
||||
"message": "Tidying certificate store: checking entry 234 of 488",
|
||||
"revoked_cert_deleted_count": 0,
|
||||
"cert_store_deleted_count": 2,
|
||||
"state": "Running",
|
||||
"time_started": "2021-10-20T14:52:13.510161-04:00",
|
||||
"time_finished": null
|
||||
},
|
||||
```
|
||||
|
||||
# Cluster Scalability
|
||||
|
||||
Most non-introspection operations in the PKI secrets engine require a write to
|
||||
|
|
|
@ -288,30 +288,40 @@ These metrics relate to [Vault Enterprise Replication](/docs/enterprise/replicat
|
|||
|
||||
These metrics relate to the supported [secrets engines][secrets-engines].
|
||||
|
||||
| Metric | Description | Unit | Type |
|
||||
| :------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :------ |
|
||||
| `database.Initialize` | Time taken to initialize a database secret engine across all database secrets engines | ms | summary |
|
||||
| `database.<name>.Initialize` | Time taken to initialize a database secret engine for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Initialize` | ms | summary |
|
||||
| `database.Initialize.error` | Number of database secrets engine initialization operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.Initialize.error` | Number of database secrets engine initialization operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Initialize.error` | errors | counter |
|
||||
| `database.Close` | Time taken to close a database secret engine across all database secrets engines | ms | summary |
|
||||
| `database.<name>.Close` | Time taken to close a database secret engine for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Close` | ms | summary |
|
||||
| `database.Close.error` | Number of database secrets engine close operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.Close.error` | Number of database secrets engine close operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Close.error` | errors | counter |
|
||||
| `database.CreateUser` | Time taken to create a user across all database secrets engines | ms | summary |
|
||||
| `database.<name>.CreateUser` | Time taken to create a user for the named database secrets engine `<name>` | ms | summary |
|
||||
| `database.CreateUser.error` | Number of user creation operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.CreateUser.error` | Number of user creation operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.CreateUser.error` | errors | counter |
|
||||
| `database.RenewUser` | Time taken to renew a user across all database secrets engines | ms | summary |
|
||||
| `database.<name>.RenewUser` | Time taken to renew a user for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RenewUser` | ms | summary |
|
||||
| `database.RenewUser.error` | Number of user renewal operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.RenewUser.error` | Number of user renewal operations for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RenewUser.error` | errors | counter |
|
||||
| `database.RevokeUser` | Time taken to revoke a user across all database secrets engines | ms | summary |
|
||||
| `database.<name>.RevokeUser` | Time taken to revoke a user for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RevokeUser` | ms | summary |
|
||||
| `database.RevokeUser.error` | Number of user revocation operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.RevokeUser.error` | Number of user revocation operations for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RevokeUser.error` | errors | counter |
|
||||
| `vault.secret.kv.count` (cluster, namespace, mount_point) | Number of entries in each key-value secret engine. | paths | gauge |
|
||||
| `vault.secret.lease.creation` (cluster, namespace, secret_engine, mount_point, creation_ttl) | Counts the number of leases created by secret engines. | leases | counter |
|
||||
| Metric | Description | Unit | Type |
|
||||
| :------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :------ |
|
||||
| `database.Initialize` | Time taken to initialize a database secret engine across all database secrets engines | ms | summary |
|
||||
| `database.<name>.Initialize` | Time taken to initialize a database secret engine for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Initialize` | ms | summary |
|
||||
| `database.Initialize.error` | Number of database secrets engine initialization operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.Initialize.error` | Number of database secrets engine initialization operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Initialize.error` | errors | counter |
|
||||
| `database.Close` | Time taken to close a database secret engine across all database secrets engines | ms | summary |
|
||||
| `database.<name>.Close` | Time taken to close a database secret engine for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Close` | ms | summary |
|
||||
| `database.Close.error` | Number of database secrets engine close operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.Close.error` | Number of database secrets engine close operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.Close.error` | errors | counter |
|
||||
| `database.CreateUser` | Time taken to create a user across all database secrets engines | ms | summary |
|
||||
| `database.<name>.CreateUser` | Time taken to create a user for the named database secrets engine `<name>` | ms | summary |
|
||||
| `database.CreateUser.error` | Number of user creation operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.CreateUser.error` | Number of user creation operation errors for the named database secrets engine `<name>`, for example: `database.postgresql-prod.CreateUser.error` | errors | counter |
|
||||
| `database.RenewUser` | Time taken to renew a user across all database secrets engines | ms | summary |
|
||||
| `database.<name>.RenewUser` | Time taken to renew a user for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RenewUser` | ms | summary |
|
||||
| `database.RenewUser.error` | Number of user renewal operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.RenewUser.error` | Number of user renewal operations for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RenewUser.error` | errors | counter |
|
||||
| `database.RevokeUser` | Time taken to revoke a user across all database secrets engines | ms | summary |
|
||||
| `database.<name>.RevokeUser` | Time taken to revoke a user for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RevokeUser` | ms | summary |
|
||||
| `database.RevokeUser.error` | Number of user revocation operation errors across all database secrets engines | errors | counter |
|
||||
| `database.<name>.RevokeUser.error` | Number of user revocation operations for the named database secrets engine `<name>`, for example: `database.postgresql-prod.RevokeUser.error` | errors | counter |
|
||||
| `secrets.pki.tidy.cert_store_current_entry` | The index of the current entry in the certificate store being verified by the tidy operation | entry index | gauge |
|
||||
| `secrets.pki.tidy.cert_store_deleted_count` | Number of entries deleted from the certificate store | entry | counter |
|
||||
| `secrets.pki.tidy.cert_store_total_entries` | Number of entries in the certificate store to verify during the tidy operation | entry | gauge |
|
||||
| `secrets.pki.tidy.duration` | Duration of time taken by the PKI tidy operation | ms | summary |
|
||||
| `secrets.pki.tidy.failure` | Number of times the PKI tidy operation has not completed due to errors | operations | counter |
|
||||
| `secrets.pki.tidy.revoked_cert_current_entry` | The index of the current revoked certificate entry in the certificate store being verified by the tidy operation | entry index | gauge |
|
||||
| `secrets.pki.tidy.revoked_cert_deleted_count` | Number of entries deleted from the certificate store for revoked certificates | entry | counter |
|
||||
| `secrets.pki.tidy.revoked_cert_total_entries` | Number of entries in the certificate store for revoked certificates to verify during the tidy operation | entry | gauge |
|
||||
| `secrets.pki.tidy.start_time_epoch` | Start time (as seconds since Jan 1 1970) when the PKI tidy operation is active, 0 otherwise | seconds | gauge |
|
||||
| `secrets.pki.tidy.success`                                                                  | Number of times the PKI tidy operation has completed successfully                                                                                                          | operations | counter |
|
||||
| `vault.secret.kv.count` (cluster, namespace, mount_point) | Number of entries in each key-value secret engine. | paths | gauge |
|
||||
| `vault.secret.lease.creation` (cluster, namespace, secret_engine, mount_point, creation_ttl) | Counts the number of leases created by secret engines. | leases | counter |
|
||||
|
||||
## Storage Backend Metrics
|
||||
|
||||
|
|
Loading…
Reference in New Issue