open-vault/builtin/logical/pki/backend.go
Alexander Scheel b8576a8de4
Add ability to request manual rebuild of Delta CRLs (#16964)
* Add path to manually rebuild delta CRLs

The crl/rotate-delta path behaves like crl/rotate, triggering a
cluster-local rebuild of just the delta CRL. This is useful for when
delta CRLs are enabled with a longer-than-desired auto-rebuild period
after some high-profile revocations occur.

In the event delta CRLs are not enabled, this becomes a no-op.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add tests for Delta CRL rebuilding

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Update documentation about Delta CRLs

Also fixes a omission in the If-Modified-Since docs to mention that the
response header should probably also be passed through.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
2022-08-31 12:42:59 -07:00

530 lines
15 KiB
Go

package pki
import (
"context"
"fmt"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/armon/go-metrics"
"github.com/hashicorp/vault/helper/metricsutil"
"github.com/hashicorp/vault/helper/namespace"
"github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/logical"
)
const (
noRole = 0
roleOptional = 1
roleRequired = 2
)
/*
* PKI requests are a bit special to keep up with the various failure and load issues.
*
* Any requests to write/delete shared data (such as roles, issuers, keys, and configuration)
* are always forwarded to the Primary cluster's active node to write and send the key
* material/config globally across all clusters. Reads should be handled locally, to give a
* sense of where this cluster's replication state is at.
*
* CRL/Revocation and Fetch Certificate APIs are handled by the active node within the cluster
* they originate. This means, if a request comes into a performance secondary cluster, the writes
* will be forwarded to that cluster's active node and not go all the way up to the performance primary's
* active node.
*
* If a certificate issue request has a role in which no_store is set to true, that node itself
* will issue the certificate and not forward the request to the active node, as this does not
* need to write to storage.
*
* Following the same pattern, if a managed key is involved to sign an issued certificate request
* and the local node does not have access for some reason to it, the request will be forwarded to
* the active node within the cluster only.
*
* To make sense of what goes where the following bits need to be analyzed within the codebase.
*
* 1. The backend LocalStorage paths determine what storage paths will remain within a
* cluster and not be forwarded to a performance primary
* 2. Within each path's OperationHandler definition, check to see if ForwardPerformanceStandby &
* ForwardPerformanceSecondary flags are set to short-circuit the request to a given active node
* 3. Within the managed key util class in pki, an initialization failure could cause the request
* to be forwarded to an active node if not already on it.
*/
// Factory creates a new backend implementing the logical.Backend interface
func Factory(ctx context.Context, conf *logical.BackendConfig) (logical.Backend, error) {
b := Backend(conf)
if err := b.Setup(ctx, conf); err != nil {
return nil, err
}
return b, nil
}
// Backend returns a new Backend framework struct
func Backend(conf *logical.BackendConfig) *backend {
var b backend
b.Backend = &framework.Backend{
Help: strings.TrimSpace(backendHelp),
PathsSpecial: &logical.Paths{
Unauthenticated: []string{
"cert/*",
"ca/pem",
"ca_chain",
"ca",
"crl/delta",
"crl/delta/pem",
"crl/pem",
"crl",
"issuer/+/crl/der",
"issuer/+/crl/pem",
"issuer/+/crl",
"issuer/+/crl/delta/der",
"issuer/+/crl/delta/pem",
"issuer/+/crl/delta",
"issuer/+/pem",
"issuer/+/der",
"issuer/+/json",
"issuers/", // LIST operations append a '/' to the requested path
"ocsp", // OCSP POST
"ocsp/*", // OCSP GET
},
LocalStorage: []string{
revokedPath,
deltaWALPath,
legacyCRLPath,
"crls/",
"certs/",
},
Root: []string{
"root",
"root/sign-self-issued",
},
SealWrapStorage: []string{
legacyCertBundlePath,
keyPrefix,
},
},
Paths: []*framework.Path{
pathListRoles(&b),
pathRoles(&b),
pathGenerateRoot(&b),
pathSignIntermediate(&b),
pathSignSelfIssued(&b),
pathDeleteRoot(&b),
pathGenerateIntermediate(&b),
pathSetSignedIntermediate(&b),
pathConfigCA(&b),
pathConfigCRL(&b),
pathConfigURLs(&b),
pathSignVerbatim(&b),
pathSign(&b),
pathIssue(&b),
pathRotateCRL(&b),
pathRotateDeltaCRL(&b),
pathRevoke(&b),
pathRevokeWithKey(&b),
pathTidy(&b),
pathTidyCancel(&b),
pathTidyStatus(&b),
pathConfigAutoTidy(&b),
// Issuer APIs
pathListIssuers(&b),
pathGetIssuer(&b),
pathGetIssuerCRL(&b),
pathImportIssuer(&b),
pathIssuerIssue(&b),
pathIssuerSign(&b),
pathIssuerSignIntermediate(&b),
pathIssuerSignSelfIssued(&b),
pathIssuerSignVerbatim(&b),
pathIssuerGenerateRoot(&b),
pathRotateRoot(&b),
pathIssuerGenerateIntermediate(&b),
pathCrossSignIntermediate(&b),
pathConfigIssuers(&b),
pathReplaceRoot(&b),
pathRevokeIssuer(&b),
// Key APIs
pathListKeys(&b),
pathKey(&b),
pathGenerateKey(&b),
pathImportKey(&b),
pathConfigKeys(&b),
// Fetch APIs have been lowered to favor the newer issuer API endpoints
pathFetchCA(&b),
pathFetchCAChain(&b),
pathFetchCRL(&b),
pathFetchCRLViaCertPath(&b),
pathFetchValidRaw(&b),
pathFetchValid(&b),
pathFetchListCerts(&b),
// OCSP APIs
buildPathOcspGet(&b),
buildPathOcspPost(&b),
},
Secrets: []*framework.Secret{
secretCerts(&b),
},
BackendType: logical.TypeLogical,
InitializeFunc: b.initialize,
Invalidate: b.invalidate,
PeriodicFunc: b.periodicFunc,
}
b.tidyCASGuard = new(uint32)
b.tidyCancelCAS = new(uint32)
b.tidyStatus = &tidyStatus{state: tidyStatusInactive}
b.storage = conf.StorageView
b.backendUUID = conf.BackendUUID
b.pkiStorageVersion.Store(0)
b.crlBuilder = newCRLBuilder()
// Delay the first tidy until after we've started up.
b.lastTidy = time.Now()
return &b
}
type backend struct {
*framework.Backend
backendUUID string
storage logical.Storage
revokeStorageLock sync.RWMutex
tidyCASGuard *uint32
tidyCancelCAS *uint32
tidyStatusLock sync.RWMutex
tidyStatus *tidyStatus
lastTidy time.Time
pkiStorageVersion atomic.Value
crlBuilder *crlBuilder
// Write lock around issuers and keys.
issuersLock sync.RWMutex
}
type (
tidyStatusState int
roleOperation func(ctx context.Context, req *logical.Request, data *framework.FieldData, role *roleEntry) (*logical.Response, error)
)
const (
tidyStatusInactive tidyStatusState = iota
tidyStatusStarted = iota
tidyStatusFinished = iota
tidyStatusError = iota
tidyStatusCancelling = iota
tidyStatusCancelled = iota
)
type tidyStatus struct {
// Parameters used to initiate the operation
safetyBuffer int
tidyCertStore bool
tidyRevokedCerts bool
tidyRevokedAssocs bool
pauseDuration string
// Status
state tidyStatusState
err error
timeStarted time.Time
timeFinished time.Time
message string
certStoreDeletedCount uint
revokedCertDeletedCount uint
missingIssuerCertCount uint
}
const backendHelp = `
The PKI backend dynamically generates X509 server and client certificates.
After mounting this backend, configure the CA using the "pem_bundle" endpoint within
the "config/" path.
`
func metricsKey(req *logical.Request, extra ...string) []string {
if req == nil || req.MountPoint == "" {
return extra
}
key := make([]string, len(extra)+1)
key[0] = req.MountPoint[:len(req.MountPoint)-1]
copy(key[1:], extra)
return key
}
func (b *backend) metricsWrap(callType string, roleMode int, ofunc roleOperation) framework.OperationFunc {
return func(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) {
key := metricsKey(req, callType)
var role *roleEntry
var labels []metrics.Label
var err error
var roleName string
switch roleMode {
case roleRequired:
roleName = data.Get("role").(string)
case roleOptional:
r, ok := data.GetOk("role")
if ok {
roleName = r.(string)
}
}
if roleMode > noRole {
// Get the role
role, err = b.getRole(ctx, req.Storage, roleName)
if err != nil {
return nil, err
}
if role == nil && (roleMode == roleRequired || len(roleName) > 0) {
return logical.ErrorResponse(fmt.Sprintf("unknown role: %s", roleName)), nil
}
labels = []metrics.Label{{"role", roleName}}
}
ns, err := namespace.FromContext(ctx)
if err == nil {
labels = append(labels, metricsutil.NamespaceLabel(ns))
}
start := time.Now()
defer metrics.MeasureSinceWithLabels(key, start, labels)
resp, err := ofunc(ctx, req, data, role)
if err != nil || resp.IsError() {
metrics.IncrCounterWithLabels(append(key, "failure"), 1.0, labels)
} else {
metrics.IncrCounterWithLabels(key, 1.0, labels)
}
return resp, err
}
}
// initialize is used to perform a possible PKI storage migration if needed
func (b *backend) initialize(ctx context.Context, _ *logical.InitializationRequest) error {
sc := b.makeStorageContext(ctx, b.storage)
if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
return err
}
// Grab the lock prior to the updating of the storage lock preventing us flipping
// the storage flag midway through the request stream of other requests.
b.issuersLock.Lock()
defer b.issuersLock.Unlock()
// Load up our current pki storage state, no matter the host type we are on.
b.updatePkiStorageVersion(ctx, false)
// Early exit if not a primary cluster or performance secondary with a local mount.
if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary|consts.ReplicationPerformanceStandby) ||
(!b.System().LocalMount() && b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary)) {
b.Logger().Debug("skipping PKI migration as we are not on primary or secondary with a local mount")
return nil
}
if err := migrateStorage(ctx, b, b.storage); err != nil {
b.Logger().Error("Error during migration of PKI mount: " + err.Error())
return err
}
b.updatePkiStorageVersion(ctx, false)
return nil
}
func (b *backend) useLegacyBundleCaStorage() bool {
// This helper function is here to choose whether or not we use the newer
// issuer/key storage format or the older legacy ca bundle format.
//
// This happens because we might've upgraded secondary PR clusters to
// newer vault code versions. We still want to be able to service requests
// with the old bundle format (e.g., issuing and revoking certs), until
// the primary cluster's active node is upgraded to the newer Vault version
// and the storage is migrated to the new format.
version := b.pkiStorageVersion.Load()
return version == nil || version == 0
}
func (b *backend) updatePkiStorageVersion(ctx context.Context, grabIssuersLock bool) {
info, err := getMigrationInfo(ctx, b.storage)
if err != nil {
b.Logger().Error(fmt.Sprintf("Failed loading PKI migration status, staying in legacy mode: %v", err))
return
}
if grabIssuersLock {
b.issuersLock.Lock()
defer b.issuersLock.Unlock()
}
if info.isRequired {
b.pkiStorageVersion.Store(0)
} else {
b.pkiStorageVersion.Store(1)
}
}
func (b *backend) invalidate(ctx context.Context, key string) {
switch {
case strings.HasPrefix(key, legacyMigrationBundleLogKey):
// This is for a secondary cluster to pick up that the migration has completed
// and reset its compatibility mode and rebuild the CRL locally. Kick it off
// as a go routine to not block this call due to the lock grabbing
// within updatePkiStorageVersion.
go func() {
b.Logger().Info("Detected a migration completed, resetting pki storage version")
b.updatePkiStorageVersion(ctx, true)
b.crlBuilder.requestRebuildIfActiveNode(b)
}()
case strings.HasPrefix(key, issuerPrefix):
if !b.useLegacyBundleCaStorage() {
// See note in updateDefaultIssuerId about why this is necessary.
// We do this ahead of CRL rebuilding just so we know that things
// are stale.
b.crlBuilder.invalidateCRLBuildTime()
// If an issuer has changed on the primary, we need to schedule an update of our CRL,
// the primary cluster would have done it already, but the CRL is cluster specific so
// force a rebuild of ours.
b.crlBuilder.requestRebuildIfActiveNode(b)
} else {
b.Logger().Debug("Ignoring invalidation updates for issuer as the PKI migration has yet to complete.")
}
case key == "config/crl":
// We may need to reload our OCSP status flag
b.crlBuilder.markConfigDirty()
case key == storageIssuerConfig:
b.crlBuilder.invalidateCRLBuildTime()
}
}
func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error {
sc := b.makeStorageContext(ctx, request.Storage)
doCRL := func() error {
// First attempt to reload the CRL configuration.
if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
return err
}
// As we're (below) modifying the backing storage, we need to ensure
// we're not on a standby/secondary node.
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
return nil
}
// Check if we're set to auto rebuild and a CRL is set to expire.
if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
return err
}
// Then attempt to rebuild the CRLs if required.
if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
return err
}
// If a delta CRL was rebuilt above as part of the complete CRL rebuild,
// this will be a no-op. However, if we do need to rebuild delta CRLs,
// this would cause us to do so.
if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc, false); err != nil {
return err
}
return nil
}
doAutoTidy := func() error {
// As we're (below) modifying the backing storage, we need to ensure
// we're not on a standby/secondary node.
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
return nil
}
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
if !config.Enabled || config.Interval <= 0*time.Second {
return nil
}
// Check if we should run another tidy...
now := time.Now()
b.tidyStatusLock.RLock()
nextOp := b.lastTidy.Add(config.Interval)
b.tidyStatusLock.RUnlock()
if now.Before(nextOp) {
return nil
}
// Ensure a tidy isn't already running... If it is, we'll trigger
// again when the running one finishes.
if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
return nil
}
// Prevent ourselves from starting another tidy operation while
// this one is still running. This operation runs in the background
// and has a separate error reporting mechanism.
b.tidyStatusLock.Lock()
b.lastTidy = now
b.tidyStatusLock.Unlock()
// Because the request from the parent storage will be cleared at
// some point (and potentially reused) -- due to tidy executing in
// a background goroutine -- we need to copy the storage entry off
// of the backend instead.
backendReq := &logical.Request{
Storage: b.storage,
}
b.startTidyOperation(backendReq, config)
return nil
}
crlErr := doCRL()
tidyErr := doAutoTidy()
if crlErr != nil && tidyErr != nil {
return fmt.Errorf("Error building CRLs:\n - %v\n\nError running auto-tidy:\n - %v\n", crlErr, tidyErr)
}
if crlErr != nil {
return fmt.Errorf("Error building CRLs:\n - %v\n", crlErr)
}
if tidyErr != nil {
return fmt.Errorf("Error running auto-tidy:\n - %v\n", tidyErr)
}
// Check if the CRL was invalidated due to issuer swap and update
// accordingly.
if err := b.crlBuilder.flushCRLBuildTimeInvalidation(sc); err != nil {
return err
}
// All good!
return nil
}