package pki

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/vault/helper/metricsutil"
	"github.com/hashicorp/vault/helper/namespace"
	"github.com/hashicorp/vault/sdk/framework"
	"github.com/hashicorp/vault/sdk/helper/consts"
	"github.com/hashicorp/vault/sdk/logical"
)

// Role lookup modes accepted by metricsWrap's roleMode parameter.
const (
	// noRole: the wrapped operation does not use a role; none is looked up.
	noRole = 0
	// roleOptional: a role is looked up only if the request supplies one.
	roleOptional = 1
	// roleRequired: the "role" field is always read and must resolve.
	roleRequired = 2
)

/*
 * PKI requests are a bit special to keep up with the various failure and load issues.
 *
 * Any requests to write/delete shared data (such as roles, issuers, keys, and configuration)
 * are always forwarded to the Primary cluster's active node to write and send the key
 * material/config globally across all clusters. Reads should be handled locally, to give a
 * sense of where this cluster's replication state is at.
 *
 * CRL/Revocation and Fetch Certificate APIs are handled by the active node within the cluster
 * they originate. This means, if a request comes into a performance secondary cluster, the writes
 * will be forwarded to that cluster's active node and not go all the way up to the performance primary's
 * active node.
 *
 * If a certificate issue request has a role in which no_store is set to true, that node itself
 * will issue the certificate and not forward the request to the active node, as this does not
 * need to write to storage.
 *
 * Following the same pattern, if a managed key is involved to sign an issued certificate request
 * and the local node does not have access for some reason to it, the request will be forwarded to
 * the active node within the cluster only.
 *
 * To make sense of what goes where, the following bits need to be analyzed within the codebase.
 *
 *     1. The backend LocalStorage paths determine what storage paths will remain within a
 *        cluster and not be forwarded to a performance primary
 *     2. Within each path's OperationHandler definition, check to see if ForwardPerformanceStandby &
 *        ForwardPerformanceSecondary flags are set to short-circuit the request to a given active node
 *     3. Within the managed key util class in pki, an initialization failure could cause the request
 *        to be forwarded to an active node if not already on it.
 */

// Factory creates a new backend implementing the logical.Backend interface.
// It builds the backend via Backend and runs its Setup with the supplied
// configuration; a Setup failure is returned as-is and no backend is returned.
func Factory(ctx context.Context, conf *logical.BackendConfig) (logical.Backend, error) {
	b := Backend(conf)
	if err := b.Setup(ctx, conf); err != nil {
		return nil, err
	}
	return b, nil
}

// Backend returns a new Backend framework struct.
//
// It wires up the special path lists (unauthenticated, cluster-local storage,
// root-protected, seal-wrapped), every API path and secret type, and the
// lifecycle callbacks (initialize, invalidate, periodicFunc). It then seeds
// the backend's runtime state from conf: storage view, backend UUID, tidy
// bookkeeping, storage version (0) and a fresh CRL builder.
func Backend(conf *logical.BackendConfig) *backend {
	var b backend
	b.Backend = &framework.Backend{
		Help: strings.TrimSpace(backendHelp),

		PathsSpecial: &logical.Paths{
			Unauthenticated: []string{
				"cert/*",
				"ca/pem",
				"ca_chain",
				"ca",
				"crl/delta",
				"crl/delta/pem",
				"crl/pem",
				"crl",
				"issuer/+/crl/der",
				"issuer/+/crl/pem",
				"issuer/+/crl",
				"issuer/+/crl/delta/der",
				"issuer/+/crl/delta/pem",
				"issuer/+/crl/delta",
				"issuer/+/pem",
				"issuer/+/der",
				"issuer/+/json",
				"issuers/", // LIST operations append a '/' to the requested path
				"ocsp",     // OCSP POST
				"ocsp/*",   // OCSP GET
			},

			LocalStorage: []string{
				revokedPath,
				deltaWALPath,
				legacyCRLPath,
				"crls/",
				"certs/",
			},

			Root: []string{
				"root",
				"root/sign-self-issued",
			},

			SealWrapStorage: []string{
				legacyCertBundlePath,
				keyPrefix,
			},
		},

		Paths: []*framework.Path{
			pathListRoles(&b),
			pathRoles(&b),
			pathGenerateRoot(&b),
			pathSignIntermediate(&b),
			pathSignSelfIssued(&b),
			pathDeleteRoot(&b),
			pathGenerateIntermediate(&b),
			pathSetSignedIntermediate(&b),
			pathConfigCA(&b),
			pathConfigCRL(&b),
			pathConfigURLs(&b),
			pathSignVerbatim(&b),
			pathSign(&b),
			pathIssue(&b),
			pathRotateCRL(&b),
			pathRotateDeltaCRL(&b),
			pathRevoke(&b),
			pathRevokeWithKey(&b),
			pathTidy(&b),
			pathTidyCancel(&b),
			pathTidyStatus(&b),
			pathConfigAutoTidy(&b),

			// Issuer APIs
			pathListIssuers(&b),
			pathGetIssuer(&b),
			pathGetIssuerCRL(&b),
			pathImportIssuer(&b),
			pathIssuerIssue(&b),
			pathIssuerSign(&b),
			pathIssuerSignIntermediate(&b),
			pathIssuerSignSelfIssued(&b),
			pathIssuerSignVerbatim(&b),
			pathIssuerGenerateRoot(&b),
			pathRotateRoot(&b),
			pathIssuerGenerateIntermediate(&b),
			pathCrossSignIntermediate(&b),
			pathConfigIssuers(&b),
			pathReplaceRoot(&b),
			pathRevokeIssuer(&b),

			// Key APIs
			pathListKeys(&b),
			pathKey(&b),
			pathGenerateKey(&b),
			pathImportKey(&b),
			pathConfigKeys(&b),

			// Fetch APIs have been lowered to favor the newer issuer API endpoints
			pathFetchCA(&b),
			pathFetchCAChain(&b),
			pathFetchCRL(&b),
			pathFetchCRLViaCertPath(&b),
			pathFetchValidRaw(&b),
			pathFetchValid(&b),
			pathFetchListCerts(&b),

			// OCSP APIs
			buildPathOcspGet(&b),
			buildPathOcspPost(&b),
		},

		Secrets: []*framework.Secret{
			secretCerts(&b),
		},

		BackendType:    logical.TypeLogical,
		InitializeFunc: b.initialize,
		Invalidate:     b.invalidate,
		PeriodicFunc:   b.periodicFunc,
	}

	b.tidyCASGuard = new(uint32)
	b.tidyCancelCAS = new(uint32)
	b.tidyStatus = &tidyStatus{state: tidyStatusInactive}
	b.storage = conf.StorageView
	b.backendUUID = conf.BackendUUID

	// Start in legacy mode (version 0); updatePkiStorageVersion adjusts this
	// once the migration status has been read from storage.
	b.pkiStorageVersion.Store(0)

	b.crlBuilder = newCRLBuilder()

	// Delay the first tidy until after we've started up.
	b.lastTidy = time.Now()

	return &b
}

// backend is the PKI secrets engine state: the embedded framework backend
// plus the locks and bookkeeping shared by its request handlers.
type backend struct {
	*framework.Backend

	// backendUUID and storage are copied from the BackendConfig in Backend.
	backendUUID       string
	storage           logical.Storage
	revokeStorageLock sync.RWMutex

	// tidyCASGuard is flipped 0 -> 1 with an atomic compare-and-swap before a
	// tidy operation is started, so only one can be started at a time.
	tidyCASGuard  *uint32
	tidyCancelCAS *uint32

	// tidyStatusLock is held when lastTidy is read or written in periodicFunc.
	// NOTE(review): presumably it also guards tidyStatus - confirm in the tidy paths.
	tidyStatusLock sync.RWMutex
	tidyStatus     *tidyStatus
	lastTidy       time.Time

	// pkiStorageVersion holds 0 while the legacy CA bundle format is in use
	// and 1 once no migration is required (see updatePkiStorageVersion).
	pkiStorageVersion atomic.Value
	crlBuilder        *crlBuilder

	// Write lock around issuers and keys.
	issuersLock sync.RWMutex
}

type (
	// tidyStatusState enumerates the lifecycle states of a tidy operation.
	tidyStatusState int

	// roleOperation is the handler signature wrapped by metricsWrap: a
	// framework operation that additionally receives the resolved role
	// (nil when no role was looked up or an optional role was not supplied).
	roleOperation func(ctx context.Context, req *logical.Request, data *framework.FieldData, role *roleEntry) (*logical.Response, error)
)

// Tidy operation states.
//
// NOTE(review): only tidyStatusInactive carries the explicit tidyStatusState
// type. Because every following constant repeats "= iota" explicitly, those
// are untyped integer constants (values 1..5). They still convert implicitly
// wherever a tidyStatusState is expected.
const (
	tidyStatusInactive   tidyStatusState = iota
	tidyStatusStarted                    = iota
	tidyStatusFinished                   = iota
	tidyStatusError                      = iota
	tidyStatusCancelling                 = iota
	tidyStatusCancelled                  = iota
)

// tidyStatus records the parameters and the progress of a tidy operation.
type tidyStatus struct {
	// Parameters used to initiate the operation
	safetyBuffer      int
	tidyCertStore     bool
	tidyRevokedCerts  bool
	tidyRevokedAssocs bool
	pauseDuration     string

	// Status
	state                   tidyStatusState
	err                     error
	timeStarted             time.Time
	timeFinished            time.Time
	message                 string
	certStoreDeletedCount   uint
	revokedCertDeletedCount uint
	missingIssuerCertCount  uint
}

// backendHelp is the help text for the mount; Backend trims surrounding
// whitespace before handing it to the framework.
const backendHelp = `
The PKI backend dynamically generates X509 server and client certificates.

After mounting this backend, configure the CA using the "pem_bundle" endpoint within
the "config/" path.
`

// metricsKey builds a metrics key of the form [mount, extra...].
//
// The mount component is req.MountPoint with a single trailing "/" removed.
// When req is nil or has no mount point, extra is returned unchanged (the
// same slice, not a copy).
func metricsKey(req *logical.Request, extra ...string) []string {
	if req == nil || req.MountPoint == "" {
		return extra
	}
	key := make([]string, len(extra)+1)
	// Trim only an actual trailing slash, so a mount point without one does
	// not lose its last character.
	key[0] = strings.TrimSuffix(req.MountPoint, "/")
	copy(key[1:], extra)
	return key
}

// metricsWrap adapts a roleOperation into a framework.OperationFunc that
// resolves the request's role and emits timing and success/failure metrics.
//
// callType is appended to the mount name to form the metric key. roleMode is
// one of noRole, roleOptional or roleRequired and controls whether the "role"
// field is read and whether a missing role is an error. The returned function
// responds with an "unknown role" error response when a role is required (or
// was named) but does not exist, and propagates storage errors from the role
// lookup.
func (b *backend) metricsWrap(callType string, roleMode int, ofunc roleOperation) framework.OperationFunc {
	return func(ctx context.Context, req *logical.Request, data *framework.FieldData) (*logical.Response, error) {
		key := metricsKey(req, callType)
		var role *roleEntry
		var labels []metrics.Label
		var err error

		var roleName string
		switch roleMode {
		case roleRequired:
			roleName = data.Get("role").(string)
		case roleOptional:
			r, ok := data.GetOk("role")
			if ok {
				roleName = r.(string)
			}
		}
		if roleMode > noRole {
			// Get the role
			role, err = b.getRole(ctx, req.Storage, roleName)
			if err != nil {
				return nil, err
			}
			// An optional role that was not named is allowed to be absent.
			if role == nil && (roleMode == roleRequired || len(roleName) > 0) {
				return logical.ErrorResponse(fmt.Sprintf("unknown role: %s", roleName)), nil
			}
			labels = []metrics.Label{{Name: "role", Value: roleName}}
		}

		// The namespace label is best-effort: it is skipped if the context
		// carries no namespace.
		ns, err := namespace.FromContext(ctx)
		if err == nil {
			labels = append(labels, metricsutil.NamespaceLabel(ns))
		}

		start := time.Now()
		defer metrics.MeasureSinceWithLabels(key, start, labels)
		resp, err := ofunc(ctx, req, data, role)

		if err != nil || resp.IsError() {
			metrics.IncrCounterWithLabels(append(key, "failure"), 1.0, labels)
		} else {
			metrics.IncrCounterWithLabels(key, 1.0, labels)
		}
		return resp, err
	}
}

// initialize is used to perform a possible PKI storage migration if needed.
//
// It reloads the CRL configuration, then, holding the issuers write lock,
// loads the current storage version. On nodes where migration must not run
// (DR secondaries, performance standbys, and performance secondaries with a
// non-local mount) it returns early. Otherwise it runs the storage migration
// and refreshes the storage version again.
func (b *backend) initialize(ctx context.Context, _ *logical.InitializationRequest) error {
	sc := b.makeStorageContext(ctx, b.storage)
	if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
		return err
	}

	// Grab the lock prior to the updating of the storage lock preventing us flipping
	// the storage flag midway through the request stream of other requests.
	b.issuersLock.Lock()
	defer b.issuersLock.Unlock()

	// Load up our current pki storage state, no matter the host type we are on.
	// The issuers lock is already held, so tell the helper not to grab it.
	b.updatePkiStorageVersion(ctx, false)

	// Early exit if not a primary cluster or performance secondary with a local mount.
	if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary|consts.ReplicationPerformanceStandby) ||
		(!b.System().LocalMount() && b.System().ReplicationState().HasState(consts.ReplicationPerformanceSecondary)) {
		b.Logger().Debug("skipping PKI migration as we are not on primary or secondary with a local mount")
		return nil
	}

	if err := migrateStorage(ctx, b, b.storage); err != nil {
		b.Logger().Error("Error during migration of PKI mount: " + err.Error())
		return err
	}

	b.updatePkiStorageVersion(ctx, false)

	return nil
}

// useLegacyBundleCaStorage reports whether the backend is still operating on
// the legacy CA bundle storage format, i.e. the stored version is unset or 0.
func (b *backend) useLegacyBundleCaStorage() bool {
	// This helper function is here to choose whether or not we use the newer
	// issuer/key storage format or the older legacy ca bundle format.
	//
	// This happens because we might've upgraded secondary PR clusters to
	// newer vault code versions. We still want to be able to service requests
	// with the old bundle format (e.g., issuing and revoking certs), until
	// the primary cluster's active node is upgraded to the newer Vault version
	// and the storage is migrated to the new format.
	version := b.pkiStorageVersion.Load()
	return version == nil || version == 0
}

// updatePkiStorageVersion re-reads the migration status from storage and
// stores 0 (migration still required) or 1 (migrated) in pkiStorageVersion.
//
// If the status cannot be loaded, the error is logged and the current version
// is left untouched. Set grabIssuersLock to true only when the caller does
// not already hold b.issuersLock; the lock is then taken for the update.
func (b *backend) updatePkiStorageVersion(ctx context.Context, grabIssuersLock bool) {
	info, err := getMigrationInfo(ctx, b.storage)
	if err != nil {
		b.Logger().Error(fmt.Sprintf("Failed loading PKI migration status, staying in legacy mode: %v", err))
		return
	}

	if grabIssuersLock {
		b.issuersLock.Lock()
		defer b.issuersLock.Unlock()
	}

	if info.isRequired {
		b.pkiStorageVersion.Store(0)
	} else {
		b.pkiStorageVersion.Store(1)
	}
}

// invalidate reacts to a changed storage key by refreshing whatever cached
// state depends on it: the storage version after a migration, the CRL after
// an issuer change, and the CRL configuration / build time for their
// respective config keys. Keys that match none of the cases are ignored.
func (b *backend) invalidate(ctx context.Context, key string) {
	switch {
	case strings.HasPrefix(key, legacyMigrationBundleLogKey):
		// This is for a secondary cluster to pick up that the migration has completed
		// and reset its compatibility mode and rebuild the CRL locally. Kick it off
		// as a go routine to not block this call due to the lock grabbing
		// within updatePkiStorageVersion.
		go func() {
			b.Logger().Info("Detected a migration completed, resetting pki storage version")
			b.updatePkiStorageVersion(ctx, true)
			b.crlBuilder.requestRebuildIfActiveNode(b)
		}()
	case strings.HasPrefix(key, issuerPrefix):
		if !b.useLegacyBundleCaStorage() {
			// See note in updateDefaultIssuerId about why this is necessary.
			// We do this ahead of CRL rebuilding just so we know that things
			// are stale.
			b.crlBuilder.invalidateCRLBuildTime()

			// If an issuer has changed on the primary, we need to schedule an update of our CRL,
			// the primary cluster would have done it already, but the CRL is cluster specific so
			// force a rebuild of ours.
			b.crlBuilder.requestRebuildIfActiveNode(b)
		} else {
			b.Logger().Debug("Ignoring invalidation updates for issuer as the PKI migration has yet to complete.")
		}
	case key == "config/crl":
		// We may need to reload our OCSP status flag
		b.crlBuilder.markConfigDirty()
	case key == storageIssuerConfig:
		b.crlBuilder.invalidateCRLBuildTime()
	}
}

// periodicFunc is the backend's periodic callback. It performs CRL
// maintenance and, when auto-tidy is enabled and due, starts a tidy operation.
//
// Both steps are always attempted; their errors are combined into a single
// returned error. Only when both succeed is any pending CRL build time
// invalidation flushed.
func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error {
	sc := b.makeStorageContext(ctx, request.Storage)

	doCRL := func() error {
		// First attempt to reload the CRL configuration.
		if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
			return err
		}

		// As we're (below) modifying the backing storage, we need to ensure
		// we're not on a standby/secondary node.
		if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
			b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
			return nil
		}

		// Check if we're set to auto rebuild and a CRL is set to expire.
		if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
			return err
		}

		// Then attempt to rebuild the CRLs if required.
		if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
			return err
		}

		// If a delta CRL was rebuilt above as part of the complete CRL rebuild,
		// this will be a no-op. However, if we do need to rebuild delta CRLs,
		// this would cause us to do so.
		if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc, false); err != nil {
			return err
		}

		return nil
	}

	doAutoTidy := func() error {
		// As we're (below) modifying the backing storage, we need to ensure
		// we're not on a standby/secondary node.
		if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
			b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
			return nil
		}

		config, err := sc.getAutoTidyConfig()
		if err != nil {
			return err
		}

		if !config.Enabled || config.Interval <= 0*time.Second {
			return nil
		}

		// Check if we should run another tidy...
		now := time.Now()
		b.tidyStatusLock.RLock()
		nextOp := b.lastTidy.Add(config.Interval)
		b.tidyStatusLock.RUnlock()
		if now.Before(nextOp) {
			return nil
		}

		// Ensure a tidy isn't already running... If it is, we'll trigger
		// again when the running one finishes.
		if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
			return nil
		}

		// Prevent ourselves from starting another tidy operation while
		// this one is still running. This operation runs in the background
		// and has a separate error reporting mechanism.
		b.tidyStatusLock.Lock()
		b.lastTidy = now
		b.tidyStatusLock.Unlock()

		// Because the request from the parent storage will be cleared at
		// some point (and potentially reused) -- due to tidy executing in
		// a background goroutine -- we need to copy the storage entry off
		// of the backend instead.
		backendReq := &logical.Request{
			Storage: b.storage,
		}

		b.startTidyOperation(backendReq, config)
		return nil
	}

	// Run both steps unconditionally so a CRL failure does not block tidy.
	crlErr := doCRL()
	tidyErr := doAutoTidy()

	if crlErr != nil && tidyErr != nil {
		return fmt.Errorf("Error building CRLs:\n - %v\n\nError running auto-tidy:\n - %v\n", crlErr, tidyErr)
	}

	if crlErr != nil {
		return fmt.Errorf("Error building CRLs:\n - %v\n", crlErr)
	}

	if tidyErr != nil {
		return fmt.Errorf("Error running auto-tidy:\n - %v\n", tidyErr)
	}

	// Check if the CRL was invalidated due to issuer swap and update
	// accordingly.
	if err := b.crlBuilder.flushCRLBuildTimeInvalidation(sc); err != nil {
		return err
	}

	// All good!
	return nil
}