open-nomad/nomad/vault.go
Luiz Aoqui ab7eb5de6e
Support Vault entity aliases (#12449)
Move some common Vault API data struct decoding out of the Vault client
so it can be reused in other situations.

Make Vault job validation its own function so it's easier to expand it.

Rename the `Job.VaultPolicies` method to just `Job.Vault` since it
returns the full Vault block, not just their policies.

Set `ChangeMode` on `Vault.Canonicalize`.

Add some missing tests.

Allows specifying an entity alias that will be used by Nomad when
deriving the task Vault token.

An entity alias assigns an indentity to a token, allowing better control
and management of Vault clients since all tokens with the same indentity
alias will now be considered the same client. This helps track Nomad
activity in Vault's audit logs and better control over Vault billing.

Add support for a new Nomad server configuration to define a default
entity alias to be used when deriving Vault tokens. This default value
will be used if the task doesn't have an entity alias defined.
2022-04-05 14:18:10 -04:00

1474 lines
44 KiB
Go

package nomad
import (
"context"
"errors"
"fmt"
"math/rand"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/hashicorp/nomad/helper"
tomb "gopkg.in/tomb.v2"
metrics "github.com/armon/go-metrics"
log "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
vapi "github.com/hashicorp/vault/api"
"golang.org/x/sync/errgroup"
"golang.org/x/time/rate"
)
const (
// vaultTokenCreateTTL is the duration the wrapped token for the client is
// valid for. The units are in seconds.
vaultTokenCreateTTL = "60s"
// minimumTokenTTL is the minimum Token TTL allowed for child tokens.
minimumTokenTTL = 5 * time.Minute
// defaultTokenTTL is the default Token TTL used when the passed token is a
// root token such that child tokens aren't being created against a role
// that has defined a TTL
defaultTokenTTL = "72h"
// requestRateLimit is the maximum number of requests per second Nomad will
// make against Vault
requestRateLimit rate.Limit = 500.0
// maxParallelRevokes is the maximum number of parallel Vault
// token revocation requests
maxParallelRevokes = 64
// vaultRevocationIntv is the interval at which Vault tokens that failed
// initial revocation are retried
vaultRevocationIntv = 5 * time.Minute
// vaultCapabilitiesLookupPath is the path to lookup the capabilities of
// ones token.
vaultCapabilitiesLookupPath = "sys/capabilities-self"
// vaultTokenRenewPath is the path used to renew our token
vaultTokenRenewPath = "auth/token/renew-self"
// vaultTokenLookupPath is the path used to lookup a token
vaultTokenLookupPath = "auth/token/lookup"
// vaultTokenRevokePath is the path used to revoke a token
vaultTokenRevokePath = "auth/token/revoke-accessor"
// vaultRoleLookupPath is the path to lookup a role
vaultRoleLookupPath = "auth/token/roles/%s"
// vaultRoleCreatePath is the path to create a token from a role
vaultTokenRoleCreatePath = "auth/token/create/%s"
)
var (
// vaultCapabilitiesCapability is the expected capability of Nomad's Vault
// token on the the path. The token must have at least one of the
// capabilities.
vaultCapabilitiesCapability = []string{"update", "root"}
// vaultTokenRenewCapability is the expected capability Nomad's
// Vault token should have on the path. The token must have at least one of
// the capabilities.
vaultTokenRenewCapability = []string{"update", "root"}
// vaultTokenLookupCapability is the expected capability Nomad's
// Vault token should have on the path. The token must have at least one of
// the capabilities.
vaultTokenLookupCapability = []string{"update", "root"}
// vaultTokenRevokeCapability is the expected capability Nomad's
// Vault token should have on the path. The token must have at least one of
// the capabilities.
vaultTokenRevokeCapability = []string{"update", "root"}
// vaultRoleLookupCapability is the the expected capability Nomad's Vault
// token should have on the path. The token must have at least one of the
// capabilities.
vaultRoleLookupCapability = []string{"read", "root"}
// vaultTokenRoleCreateCapability is the the expected capability Nomad's Vault
// token should have on the path. The token must have at least one of the
// capabilities.
vaultTokenRoleCreateCapability = []string{"update", "root"}
)
// VaultClient is the Servers interface for interfacing with Vault
type VaultClient interface {
// SetActive activates or de-activates the Vault client. When active, token
// creation/lookup/revocation operation are allowed.
SetActive(active bool)
// SetConfig updates the config used by the Vault client
SetConfig(config *config.VaultConfig) error
// CreateToken takes an allocation and task and returns an appropriate Vault
// Secret
CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error)
// LookupToken takes a token string and returns its capabilities.
LookupToken(ctx context.Context, token string) (*vapi.Secret, error)
// LookupTokenRole takes a role name string and returns its data.
LookupTokenRole(ctx context.Context, role string) (*vapi.Secret, error)
// RevokeTokens takes a set of tokens accessor and revokes the tokens
RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error
// MarkForRevocation revokes the tokens in background
MarkForRevocation(accessors []*structs.VaultAccessor) error
// Stop is used to stop token renewal
Stop()
// Running returns whether the Vault client is running
Running() bool
// Stats returns the Vault clients statistics
Stats() map[string]string
// EmitStats emits that clients statistics at the given period until stopCh
// is called.
EmitStats(period time.Duration, stopCh <-chan struct{})
}
// VaultStats returns all the stats about Vault tokens created and managed by
// Nomad.
type VaultStats struct {
// TrackedForRevoke is the count of tokens that are being tracked to be
// revoked since they could not be immediately revoked.
TrackedForRevoke int
// TokenTTL is the time-to-live duration for the current token
TokenTTL time.Duration
// TokenExpiry is the recorded expiry time of the current token
TokenExpiry time.Time
}
// PurgeVaultAccessorFn is called to remove VaultAccessors from the system. If
// the function returns an error, the token will still be tracked and revocation
// will retry till there is a success
type PurgeVaultAccessorFn func(accessors []*structs.VaultAccessor) error
// vaultClient is the Servers implementation of the VaultClient interface. The
// client renews the PeriodicToken given in the Vault configuration and provides
// the Server with the ability to create child tokens and lookup the permissions
// of tokens.
type vaultClient struct {
// limiter is used to rate limit requests to Vault
limiter *rate.Limiter
// client is the Vault API client used for Namespace-relative integrations
// with the Vault API (anything except `/v1/sys`). If this server is not
// configured to reference a Vault namespace, this will point to the same
// client as clientSys
client *vapi.Client
// clientSys is the Vault API client used for non-Namespace-relative integrations
// with the Vault API (anything involving `/v1/sys`). This client is never configured
// with a Vault namespace, because these endpoints may return errors if a namespace
// header is provided
clientSys *vapi.Client
// auth is the Vault token auth API client
auth *vapi.TokenAuth
// config is the user passed Vault config
config *config.VaultConfig
// connEstablished marks whether we have an established connection to Vault.
connEstablished bool
// connEstablishedErr marks an error that can occur when establishing a
// connection
connEstablishedErr error
// token is the raw token used by the client
token string
// tokenData is the data of the passed Vault token
tokenData *structs.VaultTokenData
// revoking tracks the VaultAccessors that must be revoked
revoking map[*structs.VaultAccessor]time.Time
purgeFn PurgeVaultAccessorFn
revLock sync.Mutex
// active indicates whether the vaultClient is active. It should be
// accessed using a helper and updated atomically
active int32
// running indicates whether the vault client is started.
running bool
// renewLoopActive indicates whether the renewal goroutine is running
// It should be accessed and updated atomically
// used for testing purposes only
renewLoopActive int32
// childTTL is the TTL for child tokens.
childTTL string
// currentExpiration is the time the current token lease expires
currentExpiration time.Time
currentExpirationLock sync.Mutex
tomb *tomb.Tomb
logger log.Logger
// l is used to lock the configuration aspects of the client such that
// multiple callers can't cause conflicting config updates
l sync.Mutex
// setConfigLock serializes access to the SetConfig method
setConfigLock sync.Mutex
// consts as struct fields for overriding in tests
maxRevokeBatchSize int
revocationIntv time.Duration
entHandler taskClientHandler
}
type taskClientHandler interface {
clientForTask(v *vaultClient, namespace string) (*vapi.Client, error)
}
// NewVaultClient returns a Vault client from the given config. If the client
// couldn't be made an error is returned.
func NewVaultClient(c *config.VaultConfig, logger log.Logger, purgeFn PurgeVaultAccessorFn, delegate taskClientHandler) (*vaultClient, error) {
if c == nil {
return nil, fmt.Errorf("must pass valid VaultConfig")
}
if logger == nil {
return nil, fmt.Errorf("must pass valid logger")
}
if purgeFn == nil {
purgeFn = func(accessors []*structs.VaultAccessor) error { return nil }
}
if delegate == nil {
delegate = &VaultNoopDelegate{}
}
v := &vaultClient{
config: c,
logger: logger.Named("vault"),
limiter: rate.NewLimiter(requestRateLimit, int(requestRateLimit)),
revoking: make(map[*structs.VaultAccessor]time.Time),
purgeFn: purgeFn,
tomb: &tomb.Tomb{},
maxRevokeBatchSize: maxVaultRevokeBatchSize,
revocationIntv: vaultRevocationIntv,
entHandler: delegate,
}
if v.config.IsEnabled() {
if err := v.buildClient(); err != nil {
return nil, err
}
// Launch the required goroutines
v.tomb.Go(wrapNilError(v.establishConnection))
v.tomb.Go(wrapNilError(v.revokeDaemon))
v.running = true
}
return v, nil
}
func (v *vaultClient) Stop() {
v.l.Lock()
running := v.running
v.running = false
v.l.Unlock()
if running {
v.tomb.Kill(nil)
v.tomb.Wait()
v.flush()
}
}
func (v *vaultClient) Running() bool {
v.l.Lock()
defer v.l.Unlock()
return v.running
}
// SetActive activates or de-activates the Vault client. When active, token
// creation/lookup/revocation operation are allowed. All queued revocations are
// cancelled if set un-active as it is assumed another instances is taking over
func (v *vaultClient) SetActive(active bool) {
if active {
atomic.StoreInt32(&v.active, 1)
} else {
atomic.StoreInt32(&v.active, 0)
}
// Clear out the revoking tokens
v.revLock.Lock()
v.revoking = make(map[*structs.VaultAccessor]time.Time)
v.revLock.Unlock()
}
// flush is used to reset the state of the vault client
func (v *vaultClient) flush() {
v.l.Lock()
defer v.l.Unlock()
v.revLock.Lock()
defer v.revLock.Unlock()
v.client = nil
v.clientSys = nil
v.auth = nil
v.connEstablished = false
v.connEstablishedErr = nil
v.token = ""
v.tokenData = nil
v.revoking = make(map[*structs.VaultAccessor]time.Time)
v.childTTL = ""
v.tomb = &tomb.Tomb{}
}
// SetConfig is used to update the Vault config being used. A temporary outage
// may occur after calling as it re-establishes a connection to Vault
func (v *vaultClient) SetConfig(config *config.VaultConfig) error {
if config == nil {
return fmt.Errorf("must pass valid VaultConfig")
}
v.setConfigLock.Lock()
defer v.setConfigLock.Unlock()
v.l.Lock()
defer v.l.Unlock()
// If reloading the same config, no-op
if v.config.IsEqual(config) {
return nil
}
// Kill any background routines
if v.running {
// Kill any background routine
v.tomb.Kill(nil)
// Locking around tomb.Wait can deadlock with
// establishConnection exiting, so we must unlock here.
v.l.Unlock()
v.tomb.Wait()
v.l.Lock()
// Stop accepting any new requests
v.connEstablished = false
v.tomb = &tomb.Tomb{}
v.running = false
}
// Store the new config
v.config = config
// Check if we should relaunch
if v.config.IsEnabled() {
// Rebuild the client
if err := v.buildClient(); err != nil {
return err
}
// Launch the required goroutines
v.tomb.Go(wrapNilError(v.establishConnection))
v.tomb.Go(wrapNilError(v.revokeDaemon))
v.running = true
}
return nil
}
// buildClient is used to build a Vault client based on the stored Vault config
func (v *vaultClient) buildClient() error {
// Validate we have the required fields.
if v.config.Token == "" {
return errors.New("Vault token must be set")
} else if v.config.Addr == "" {
return errors.New("Vault address must be set")
}
// Parse the TTL if it is set
if v.config.TaskTokenTTL != "" {
d, err := time.ParseDuration(v.config.TaskTokenTTL)
if err != nil {
return fmt.Errorf("failed to parse TaskTokenTTL %q: %v", v.config.TaskTokenTTL, err)
}
if d.Nanoseconds() < minimumTokenTTL.Nanoseconds() {
return fmt.Errorf("ChildTokenTTL is less than minimum allowed of %v", minimumTokenTTL)
}
v.childTTL = v.config.TaskTokenTTL
} else {
// Default the TaskTokenTTL
v.childTTL = defaultTokenTTL
}
// Get the Vault API configuration
apiConf, err := v.config.ApiConfig()
if err != nil {
return fmt.Errorf("Failed to create Vault API config: %v", err)
}
// Create the Vault API client
client, err := vapi.NewClient(apiConf)
if err != nil {
v.logger.Error("failed to create Vault client and not retrying", "error", err)
return err
}
// Store the client, create/assign the /sys client
v.client = client
if v.config.Namespace != "" {
v.logger.Debug("configuring Vault namespace", "namespace", v.config.Namespace)
v.clientSys, err = vapi.NewClient(apiConf)
if err != nil {
v.logger.Error("failed to create Vault sys client and not retrying", "error", err)
return err
}
client.SetNamespace(v.config.Namespace)
} else {
v.clientSys = client
}
// Set the token
v.token = v.config.Token
client.SetToken(v.token)
v.auth = client.Auth().Token()
return nil
}
// establishConnection is used to make first contact with Vault. This should be
// called in a go-routine since the connection is retried until the Vault Client
// is stopped or the connection is successfully made at which point the renew
// loop is started.
func (v *vaultClient) establishConnection() {
// Create the retry timer and set initial duration to zero so it fires
// immediately
retryTimer := time.NewTimer(0)
initStatus := false
OUTER:
for {
select {
case <-v.tomb.Dying():
return
case <-retryTimer.C:
// Retry validating the token till success
if err := v.parseSelfToken(); err != nil {
// if parsing token fails, try to distinguish legitimate token error from transient Vault initialization/connection issue
if !initStatus {
if _, err := v.clientSys.Sys().Health(); err != nil {
v.logger.Warn("failed to contact Vault API", "retry", v.config.ConnectionRetryIntv, "error", err)
retryTimer.Reset(v.config.ConnectionRetryIntv)
continue OUTER
}
initStatus = true
}
v.logger.Error("failed to validate self token/role", "retry", v.config.ConnectionRetryIntv, "error", err)
retryTimer.Reset(v.config.ConnectionRetryIntv)
v.l.Lock()
v.connEstablished = true
v.connEstablishedErr = fmt.Errorf("failed to establish connection to Vault: %v", err)
v.l.Unlock()
continue OUTER
}
break OUTER
}
}
// Set the wrapping function such that token creation is wrapped now
// that we know our role
v.client.SetWrappingLookupFunc(v.getWrappingFn())
// If we are given a non-root token, start renewing it
if v.tokenData.Root() && v.tokenData.CreationTTL == 0 {
v.logger.Debug("not renewing token as it is root")
} else {
v.logger.Debug("starting renewal loop", "creation_ttl", time.Duration(v.tokenData.CreationTTL)*time.Second)
v.tomb.Go(wrapNilError(v.renewalLoop))
}
v.l.Lock()
v.connEstablished = true
v.connEstablishedErr = nil
v.l.Unlock()
}
func (v *vaultClient) isRenewLoopActive() bool {
return atomic.LoadInt32(&v.renewLoopActive) == 1
}
// renewalLoop runs the renew loop. This should only be called if we are given a
// non-root token.
func (v *vaultClient) renewalLoop() {
atomic.StoreInt32(&v.renewLoopActive, 1)
defer atomic.StoreInt32(&v.renewLoopActive, 0)
// Create the renewal timer and set initial duration to zero so it fires
// immediately
authRenewTimer := time.NewTimer(0)
// Backoff is to reduce the rate we try to renew with Vault under error
// situations
backoff := 0.0
for {
select {
case <-v.tomb.Dying():
return
case <-authRenewTimer.C:
// Renew the token and determine the new expiration
recoverable, err := v.renew()
v.currentExpirationLock.Lock()
currentExpiration := v.currentExpiration
v.currentExpirationLock.Unlock()
// Successfully renewed
if err == nil {
// Attempt to renew the token at half the expiration time
durationUntilRenew := time.Until(currentExpiration) / 2
v.logger.Info("successfully renewed token", "next_renewal", durationUntilRenew)
authRenewTimer.Reset(durationUntilRenew)
// Reset any backoff
backoff = 0
break
}
metrics.IncrCounter([]string{"nomad", "vault", "renew_failed"}, 1)
v.logger.Warn("got error or bad auth, so backing off", "error", err, "recoverable", recoverable)
if !recoverable {
return
}
backoff = nextBackoff(backoff, currentExpiration)
if backoff < 0 {
// We have failed to renew the token past its expiration. Stop
// renewing with Vault.
v.logger.Error("failed to renew Vault token before lease expiration. Shutting down Vault client",
"error", err)
v.l.Lock()
v.connEstablished = false
v.connEstablishedErr = err
v.l.Unlock()
return
}
durationUntilRetry := time.Duration(backoff) * time.Second
v.logger.Info("backing off renewal", "retry", durationUntilRetry)
authRenewTimer.Reset(durationUntilRetry)
}
}
}
// nextBackoff returns the delay for the next auto renew interval, in seconds.
// Returns negative value if past expiration
//
// It should increase the amount of backoff each time, with the following rules:
//
// * If token expired already despite earlier renewal attempts,
// back off for 1 minute + jitter
// * If we have an existing authentication that is going to expire,
// never back off more than half of the amount of time remaining
// until expiration (with 5s floor)
// * Never back off more than 30 seconds multiplied by a random
// value between 1 and 2
// * Use randomness so that many clients won't keep hitting Vault
// at the same time
func nextBackoff(backoff float64, expiry time.Time) float64 {
maxBackoff := time.Until(expiry) / 2
if maxBackoff < 0 {
// expiry passed
return 60 * (1.0 + rand.Float64())
}
switch {
case backoff >= 24:
backoff = 30
default:
backoff = backoff * 1.25
}
// Add randomness
backoff = backoff * (1.0 + rand.Float64())
if backoff > maxBackoff.Seconds() {
backoff = maxBackoff.Seconds()
}
if backoff < 5 {
backoff = 5
}
return backoff
}
// renew attempts to renew our Vault token. If the renewal fails, an error is
// returned. The boolean indicates whether it's safe to attempt to renew again.
// This method updates the currentExpiration time
func (v *vaultClient) renew() (bool, error) {
// Track how long the request takes
defer metrics.MeasureSince([]string{"nomad", "vault", "renew"}, time.Now())
// Attempt to renew the token
secret, err := v.auth.RenewSelf(v.tokenData.CreationTTL)
if err != nil {
// Check if there is a permission denied
recoverable := !structs.VaultUnrecoverableError.MatchString(err.Error())
return recoverable, fmt.Errorf("failed to renew the vault token: %v", err)
}
if secret == nil {
// It's possible for RenewSelf to return (nil, nil) if the
// response body from Vault is empty.
return true, fmt.Errorf("renewal failed: empty response from vault")
}
// these treated as transient errors, where can keep renewing
auth := secret.Auth
if auth == nil {
return true, fmt.Errorf("renewal successful but not auth information returned")
} else if auth.LeaseDuration == 0 {
return true, fmt.Errorf("renewal successful but no lease duration returned")
}
v.extendExpiration(auth.LeaseDuration)
v.logger.Debug("successfully renewed server token")
return true, nil
}
// getWrappingFn returns an appropriate wrapping function for Nomad Servers
func (v *vaultClient) getWrappingFn() func(operation, path string) string {
createPath := "auth/token/create"
role := v.getRole()
if role != "" {
createPath = fmt.Sprintf("auth/token/create/%s", role)
}
return func(operation, path string) string {
// Only wrap the token create operation
if operation != "POST" || path != createPath {
return ""
}
return vaultTokenCreateTTL
}
}
// parseSelfToken looks up the Vault token in Vault and parses its data storing
// it in the client. If the token is not valid for Nomads purposes an error is
// returned.
func (v *vaultClient) parseSelfToken() error {
// Try looking up the token using the self endpoint
secret, err := v.lookupSelf()
if err != nil {
return err
}
// Read and parse the fields
var data structs.VaultTokenData
if err := structs.DecodeVaultSecretData(secret, &data); err != nil {
return fmt.Errorf("failed to parse Vault token's data block: %v", err)
}
v.tokenData = &data
v.extendExpiration(data.TTL)
// The criteria that must be met for the token to be valid are as follows:
// 1) If token is non-root or is but has a creation ttl
// a) The token must be renewable
// b) Token must have a non-zero TTL
// 2) Must have update capability for "auth/token/lookup/" (used to verify incoming tokens)
// 3) Must have update capability for "/auth/token/revoke-accessor/" (used to revoke unneeded tokens)
// 4) If configured to create tokens against a role:
// a) Must have read capability for "auth/token/roles/<role_name" (Can just attempt a read)
// b) Must have update capability for path "auth/token/create/<role_name>"
// c) Role must:
// 1) Must allow tokens to be renewed
// 2) Must not have an explicit max TTL
// 3) Must have non-zero period
// 4) Must allow entity alias if one is defined
// 5) Must have a role if an entity alias is defined
// 6) If not configured against a role, the token must be root
var mErr multierror.Error
role := v.getRole()
if !data.Root() {
// All non-root tokens must be renewable
if !data.Renewable {
_ = multierror.Append(&mErr, fmt.Errorf("Vault token is not renewable or root"))
}
// All non-root tokens must have a lease duration
if data.CreationTTL == 0 {
_ = multierror.Append(&mErr, fmt.Errorf("invalid lease duration of zero"))
}
// The lease duration can not be expired
if data.TTL == 0 {
_ = multierror.Append(&mErr, fmt.Errorf("token TTL is zero"))
}
// There must be a valid role since we aren't root
if role == "" {
_ = multierror.Append(&mErr, fmt.Errorf("token role name must be set when not using a root token"))
}
} else if data.CreationTTL != 0 {
// If the root token has a TTL it must be renewable
if !data.Renewable {
_ = multierror.Append(&mErr, fmt.Errorf("Vault token has a TTL but is not renewable"))
} else if data.TTL == 0 {
// If the token has a TTL make sure it has not expired
_ = multierror.Append(&mErr, fmt.Errorf("token TTL is zero"))
}
}
// Check we have the correct capabilities
if err := v.validateCapabilities(role, data.Root()); err != nil {
_ = multierror.Append(&mErr, err)
}
// If given a role validate it
if role != "" {
if err := v.validateRole(role); err != nil {
_ = multierror.Append(&mErr, err)
}
} else if v.config.EntityAlias != "" {
// If entity alias is defined the server must also have a role.
_ = multierror.Append(&mErr, fmt.Errorf("Role must be set to create tokens using an entity alias"))
}
return mErr.ErrorOrNil()
}
// lookupSelf is a helper function that looks up latest self lease info.
func (v *vaultClient) lookupSelf() (*vapi.Secret, error) {
// Get the initial lease duration
auth := v.client.Auth().Token()
secret, err := auth.LookupSelf()
if err == nil && secret != nil && secret.Data != nil {
return secret, nil
}
// Try looking up our token directly, even when we get an empty response,
// in case of an unexpected event - a true failure would occur in this lookup again
secret, err = auth.Lookup(v.client.Token())
switch {
case err != nil:
return nil, fmt.Errorf("failed to lookup Vault periodic token: %v", err)
case secret == nil || secret.Data == nil:
return nil, fmt.Errorf("failed to lookup Vault periodic token: got empty response")
default:
return secret, nil
}
}
// getRole returns the role name to be used when creating tokens
func (v *vaultClient) getRole() string {
if v.config.Role != "" {
return v.config.Role
}
return v.tokenData.Role
}
// validateCapabilities checks that Nomad's Vault token has the correct
// capabilities.
func (v *vaultClient) validateCapabilities(role string, root bool) error {
// Check if the token can lookup capabilities.
var mErr multierror.Error
_, _, err := v.hasCapability(vaultCapabilitiesLookupPath, vaultCapabilitiesCapability)
if err != nil {
// Check if there is a permission denied
if structs.VaultUnrecoverableError.MatchString(err.Error()) {
// Since we can't read permissions, we just log a warning that we
// can't tell if the Vault token will work
msg := fmt.Sprintf("can not lookup token capabilities. "+
"As such certain operations may fail in the future. "+
"Please give Nomad a Vault token with one of the following "+
"capabilities %q on %q so that the required capabilities can be verified",
vaultCapabilitiesCapability, vaultCapabilitiesLookupPath)
v.logger.Warn(msg)
return nil
} else {
_ = multierror.Append(&mErr, err)
}
}
// verify is a helper function that verifies the token has one of the
// capabilities on the given path and adds an issue to the error
verify := func(path string, requiredCaps []string) {
ok, caps, err := v.hasCapability(path, requiredCaps)
if err != nil {
_ = multierror.Append(&mErr, err)
} else if !ok {
_ = multierror.Append(&mErr,
fmt.Errorf("token must have one of the following capabilities %q on %q; has %v", requiredCaps, path, caps))
}
}
// Check if we are verifying incoming tokens
if !v.config.AllowsUnauthenticated() {
verify(vaultTokenLookupPath, vaultTokenLookupCapability)
}
// Verify we can renew our selves tokens
verify(vaultTokenRenewPath, vaultTokenRenewCapability)
// Verify we can revoke tokens
verify(vaultTokenRevokePath, vaultTokenRevokeCapability)
// If we are using a role verify the capability
if role != "" {
// Verify we can read the role
verify(fmt.Sprintf(vaultRoleLookupPath, role), vaultRoleLookupCapability)
// Verify we can create from the role
verify(fmt.Sprintf(vaultTokenRoleCreatePath, role), vaultTokenRoleCreateCapability)
}
return mErr.ErrorOrNil()
}
// hasCapability takes a path and returns whether the token has at least one of
// the required capabilities on the given path. It also returns the set of
// capabilities the token does have as well as any error that occurred.
func (v *vaultClient) hasCapability(path string, required []string) (bool, []string, error) {
caps, err := v.client.Sys().CapabilitiesSelf(path)
if err != nil {
return false, nil, err
}
for _, c := range caps {
for _, r := range required {
if c == r {
return true, caps, nil
}
}
}
return false, caps, nil
}
// validateRole contacts Vault and checks that the given Vault role is valid for
// the purposes of being used by Nomad
func (v *vaultClient) validateRole(role string) error {
if role == "" {
return fmt.Errorf("Invalid empty role name")
}
// Validate the role
rsecret, err := v.client.Logical().Read(fmt.Sprintf("auth/token/roles/%s", role))
if err != nil {
return fmt.Errorf("failed to lookup role %q: %v", role, err)
}
if rsecret == nil {
return fmt.Errorf("Role %q does not exist", role)
}
// Read and parse the fields
var data structs.VaultTokenRoleData
if err := structs.DecodeVaultSecretData(rsecret, &data); err != nil {
return fmt.Errorf("failed to parse Vault role's data block: %v", err)
}
// Validate the role is acceptable
var mErr multierror.Error
if !data.Renewable {
_ = multierror.Append(&mErr, fmt.Errorf("Role must allow tokens to be renewed"))
}
if data.ExplicitMaxTtl != 0 || data.TokenExplicitMaxTtl != 0 {
_ = multierror.Append(&mErr, fmt.Errorf("Role can not use an explicit max ttl. Token must be periodic."))
}
if data.Period == 0 && data.TokenPeriod == 0 {
_ = multierror.Append(&mErr, fmt.Errorf("Role must have a non-zero period to make tokens periodic."))
}
// If an entity alias is used, the role must be allowed to use it.
alias := v.config.EntityAlias
if alias != "" && !data.AllowsEntityAlias(alias) {
_ = multierror.Append(&mErr, fmt.Errorf("Role must allow entity alias %s to be used.", alias))
}
return mErr.ErrorOrNil()
}
// ConnectionEstablished returns whether a connection to Vault has been
// established and any error that potentially caused it to be false
func (v *vaultClient) ConnectionEstablished() (bool, error) {
v.l.Lock()
defer v.l.Unlock()
return v.connEstablished, v.connEstablishedErr
}
// Enabled returns whether the client is active
func (v *vaultClient) Enabled() bool {
v.l.Lock()
defer v.l.Unlock()
return v.config.IsEnabled()
}
// Active returns whether the client is active
func (v *vaultClient) Active() bool {
return atomic.LoadInt32(&v.active) == 1
}
// CreateToken takes the allocation and task and returns an appropriate Vault
// token. The call is rate limited and may be canceled with the passed policy.
// When the error is recoverable, it will be of type RecoverableError
func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, task string) (*vapi.Secret, error) {
if !v.Enabled() {
return nil, fmt.Errorf("Vault integration disabled")
}
if !v.Active() {
return nil, structs.NewRecoverableError(fmt.Errorf("Vault client not active"), true)
}
// Check if we have established a connection with Vault
if established, err := v.ConnectionEstablished(); !established && err == nil {
return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
} else if err != nil {
return nil, err
}
// Track how long the request takes
defer metrics.MeasureSince([]string{"nomad", "vault", "create_token"}, time.Now())
// Retrieve the Vault block for the task
vaultBlocks := a.Job.Vault()
if vaultBlocks == nil {
return nil, fmt.Errorf("Job does not require Vault token")
}
tg, ok := vaultBlocks[a.TaskGroup]
if !ok {
return nil, fmt.Errorf("Task group does not require Vault token")
}
taskVault, ok := tg[task]
if !ok {
return nil, fmt.Errorf("Task does not require Vault token")
}
// Set namespace for task
namespaceForTask := v.config.Namespace
if taskVault.Namespace != "" {
namespaceForTask = taskVault.Namespace
}
// Build the creation request
req := &vapi.TokenCreateRequest{
Policies: taskVault.Policies,
Metadata: map[string]string{
"AllocationID": a.ID,
"JobID": a.JobID,
"TaskGroup": a.TaskGroup,
"Task": task,
"NodeID": a.NodeID,
"Namespace": namespaceForTask,
},
TTL: v.childTTL,
DisplayName: fmt.Sprintf("%s-%s", a.ID, task),
}
// If the task defines an entity alias, the Nomad server must have a role
// to be able to derive a token.
role := v.getRole()
if taskVault.EntityAlias != "" {
if role == "" {
return nil, fmt.Errorf("task defines a Vault entity alias, but the Nomad server does not have a Vault role")
}
req.EntityAlias = taskVault.EntityAlias
} else if v.config.EntityAlias != "" {
if role == "" {
return nil, fmt.Errorf("Vault configuration defines an entity alias, but the Nomad server does not have a Vault role")
}
req.EntityAlias = v.config.EntityAlias
}
// Ensure we are under our rate limit
if err := v.limiter.Wait(ctx); err != nil {
return nil, err
}
// Make the request and switch depending on whether we are using a root
// token or a role based token
var secret *vapi.Secret
var err error
// Fetch client for task
taskClient, err := v.entHandler.clientForTask(v, namespaceForTask)
if err != nil {
return nil, err
}
if v.tokenData.Root() && role == "" {
req.Period = v.childTTL
secret, err = taskClient.Auth().Token().Create(req)
} else {
// Make the token using the role
secret, err = taskClient.Auth().Token().CreateWithRole(req, role)
}
// Determine whether it is unrecoverable
if err != nil {
err = fmt.Errorf("failed to create an alloc vault token: %v", err)
if structs.VaultUnrecoverableError.MatchString(err.Error()) {
return secret, err
}
// The error is recoverable
return nil, structs.NewRecoverableError(err, true)
}
// Validate the response
var validationErr error
if secret == nil {
validationErr = fmt.Errorf("Vault returned nil Secret")
} else if secret.WrapInfo == nil {
validationErr = fmt.Errorf("Vault returned Secret with nil WrapInfo. Secret warnings: %v", secret.Warnings)
} else if secret.WrapInfo.WrappedAccessor == "" {
validationErr = fmt.Errorf("Vault returned WrapInfo without WrappedAccessor. Secret warnings: %v", secret.Warnings)
}
if validationErr != nil {
v.logger.Warn("failed to CreateToken", "error", validationErr)
return nil, structs.NewRecoverableError(validationErr, true)
}
// Got a valid response
return secret, nil
}
// LookupToken takes a Vault token and does a lookup against Vault. The call is
// rate limited and may be canceled with passed context.
func (v *vaultClient) LookupToken(ctx context.Context, token string) (*vapi.Secret, error) {
if !v.Enabled() {
return nil, fmt.Errorf("Vault integration disabled")
}
if !v.Active() {
return nil, fmt.Errorf("Vault client not active")
}
// Check if we have established a connection with Vault
if established, err := v.ConnectionEstablished(); !established && err == nil {
return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
} else if err != nil {
return nil, err
}
// Track how long the request takes
defer metrics.MeasureSince([]string{"nomad", "vault", "lookup_token"}, time.Now())
// Ensure we are under our rate limit
if err := v.limiter.Wait(ctx); err != nil {
return nil, err
}
// Lookup the token
return v.auth.Lookup(token)
}
// LookupTokenRole takes a Vault token role and does a lookup against Vault.
// The call is rate limited and may be canceled with passed context.
func (v *vaultClient) LookupTokenRole(ctx context.Context, role string) (*vapi.Secret, error) {
if !v.Enabled() {
return nil, fmt.Errorf("Vault integration disabled")
}
if !v.Active() {
return nil, fmt.Errorf("Vault client not active")
}
// Check if we have established a connection with Vault
if established, err := v.ConnectionEstablished(); !established && err == nil {
return nil, structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
} else if err != nil {
return nil, err
}
// Track how long the request takes
defer metrics.MeasureSince([]string{"nomad", "vault", "lookup_token_role"}, time.Now())
// Ensure we are under our rate limit
if err := v.limiter.Wait(ctx); err != nil {
return nil, err
}
resp, err := v.client.Logical().Read(fmt.Sprintf("auth/token/roles/%s", role))
if err != nil {
return nil, fmt.Errorf("failed to lookup role: %v", err)
}
if resp == nil {
return nil, fmt.Errorf("role %q does not exist", role)
}
return resp, nil
}
// RevokeTokens revokes the passed set of accessors. If committed is set, the
// purge function passed to the client is called. If there is an error purging
// either because of Vault failures or because of the purge function, the
// revocation is retried until the tokens TTL.
func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.VaultAccessor, committed bool) error {
if !v.Enabled() {
return nil
}
if !v.Active() {
return fmt.Errorf("Vault client not active")
}
// Track how long the request takes
defer metrics.MeasureSince([]string{"nomad", "vault", "revoke_tokens"}, time.Now())
// Check if we have established a connection with Vault. If not just add it
// to the queue
if established, err := v.ConnectionEstablished(); !established && err == nil {
// Only bother tracking it for later revocation if the accessor was
// committed
if committed {
v.storeForRevocation(accessors)
}
// Track that we are abandoning these accessors.
metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors)))
return nil
}
// Attempt to revoke immediately and if it fails, add it to the revoke queue
err := v.parallelRevoke(ctx, accessors)
if err != nil {
// If it is uncommitted, it is a best effort revoke as it will shortly
// TTL within the cubbyhole and has not been leaked to any outside
// system
if !committed {
metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_abandoned"}, float32(len(accessors)))
return nil
}
v.logger.Warn("failed to revoke tokens. Will reattempt until TTL", "error", err)
v.storeForRevocation(accessors)
return nil
} else if !committed {
// Mark that it was revoked but there is nothing to purge so exit
metrics.IncrCounter([]string{"nomad", "vault", "undistributed_tokens_revoked"}, float32(len(accessors)))
return nil
}
if err := v.purgeFn(accessors); err != nil {
v.logger.Error("failed to purge Vault accessors", "error", err)
v.storeForRevocation(accessors)
return nil
}
// Track that it was revoked successfully
metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(accessors)))
return nil
}
func (v *vaultClient) MarkForRevocation(accessors []*structs.VaultAccessor) error {
if !v.Enabled() {
return nil
}
if !v.Active() {
return fmt.Errorf("Vault client not active")
}
v.storeForRevocation(accessors)
return nil
}
// storeForRevocation stores the passed set of accessors for revocation. It
// captures their effective TTL by storing their create TTL plus the current
// time.
func (v *vaultClient) storeForRevocation(accessors []*structs.VaultAccessor) {
v.revLock.Lock()
now := time.Now()
for _, a := range accessors {
if _, ok := v.revoking[a]; !ok {
v.revoking[a] = now.Add(time.Duration(a.CreationTTL) * time.Second)
}
}
v.revLock.Unlock()
}
// parallelRevoke revokes the passed VaultAccessors in parallel.
func (v *vaultClient) parallelRevoke(ctx context.Context, accessors []*structs.VaultAccessor) error {
if !v.Enabled() {
return fmt.Errorf("Vault integration disabled")
}
if !v.Active() {
return fmt.Errorf("Vault client not active")
}
// Check if we have established a connection with Vault
if established, err := v.ConnectionEstablished(); !established && err == nil {
return structs.NewRecoverableError(fmt.Errorf("Connection to Vault has not been established"), true)
} else if err != nil {
return err
}
g, pCtx := errgroup.WithContext(ctx)
// Cap the handlers
handlers := len(accessors)
if handlers > maxParallelRevokes {
handlers = maxParallelRevokes
}
// Revoke the Vault Token Accessors
input := make(chan *structs.VaultAccessor, handlers)
for i := 0; i < handlers; i++ {
g.Go(func() error {
for {
select {
case va, ok := <-input:
if !ok {
return nil
}
err := v.auth.RevokeAccessor(va.Accessor)
if err != nil && !strings.Contains(err.Error(), "invalid accessor") {
return fmt.Errorf("failed to revoke token (alloc: %q, node: %q, task: %q): %v", va.AllocID, va.NodeID, va.Task, err)
}
case <-pCtx.Done():
return nil
}
}
})
}
// Send the input
go func() {
defer close(input)
for _, va := range accessors {
select {
case <-pCtx.Done():
return
case input <- va:
}
}
}()
// Wait for everything to complete
return g.Wait()
}
// maxVaultRevokeBatchSize is the maximum tokens a revokeDaemon should revoke
// and purge at any given time.
//
// Limiting the revocation batch size is beneficial for few reasons:
// * A single revocation failure of any entry in batch result into retrying the whole batch;
// the larger the batch is the higher likelihood of such failure
// * Smaller batch sizes result into more co-operativeness: provides hooks for
// reconsidering token TTL and leadership steps down.
// * Batches limit the size of the Raft message purging tokens. Due to bugs
// pre-0.11.3, expired tokens were not properly purged, so users upgrading from
// older versions may have huge numbers (millions) of expired tokens to purge.
const maxVaultRevokeBatchSize = 1000
// revokeDaemon should be called in a goroutine and is used to periodically
// revoke Vault accessors that failed the original revocation
func (v *vaultClient) revokeDaemon() {
ticker := time.NewTicker(v.revocationIntv)
defer ticker.Stop()
for {
select {
case <-v.tomb.Dying():
return
case now := <-ticker.C:
if established, err := v.ConnectionEstablished(); !established || err != nil {
continue
}
v.revLock.Lock()
// Fast path
if len(v.revoking) == 0 {
v.revLock.Unlock()
continue
}
// Build the list of accessors that need to be revoked while pruning any TTL'd checks
toRevoke := len(v.revoking)
if toRevoke > v.maxRevokeBatchSize {
v.logger.Info("batching tokens to be revoked",
"to_revoke", toRevoke, "batch_size", v.maxRevokeBatchSize,
"batch_interval", v.revocationIntv)
toRevoke = v.maxRevokeBatchSize
}
revoking := make([]*structs.VaultAccessor, 0, toRevoke)
ttlExpired := []*structs.VaultAccessor{}
for va, ttl := range v.revoking {
if now.After(ttl) {
ttlExpired = append(ttlExpired, va)
} else {
revoking = append(revoking, va)
}
// Batches should consider tokens to be revoked
// as well as expired tokens to ensure the Raft
// message is reasonably sized.
if len(revoking)+len(ttlExpired) >= toRevoke {
break
}
}
if err := v.parallelRevoke(context.Background(), revoking); err != nil {
v.logger.Warn("background token revocation errored", "error", err)
v.revLock.Unlock()
continue
}
// Unlock before a potentially expensive operation
v.revLock.Unlock()
// purge all explicitly revoked as well as ttl expired tokens
// and only remove them locally on purge success
revoking = append(revoking, ttlExpired...)
// Call the passed in token revocation function
if err := v.purgeFn(revoking); err != nil {
// Can continue since revocation is idempotent
v.logger.Error("token revocation errored", "error", err)
continue
}
// Track that tokens were revoked successfully
metrics.IncrCounter([]string{"nomad", "vault", "distributed_tokens_revoked"}, float32(len(revoking)))
// Can delete from the tracked list now that we have purged
v.revLock.Lock()
for _, va := range revoking {
delete(v.revoking, va)
}
v.revLock.Unlock()
}
}
}
// purgeVaultAccessors creates a Raft transaction to remove the passed Vault
// Accessors
func (s *Server) purgeVaultAccessors(accessors []*structs.VaultAccessor) error {
// Commit this update via Raft
req := structs.VaultAccessorsRequest{Accessors: accessors}
_, _, err := s.raftApply(structs.VaultAccessorDeregisterRequestType, req)
return err
}
// wrapNilError is a helper that returns a wrapped function that returns a nil
// error
func wrapNilError(f func()) func() error {
return func() error {
f()
return nil
}
}
// setLimit is used to update the rate limit
func (v *vaultClient) setLimit(l rate.Limit) {
v.l.Lock()
defer v.l.Unlock()
v.limiter = rate.NewLimiter(l, int(l))
}
func (v *vaultClient) Stats() map[string]string {
stat := v.stats()
expireTimeStr := ""
if !stat.TokenExpiry.IsZero() {
expireTimeStr = stat.TokenExpiry.Format(time.RFC3339)
}
return map[string]string{
"tracked_for_revoked": strconv.Itoa(stat.TrackedForRevoke),
"token_ttl": stat.TokenTTL.Round(time.Second).String(),
"token_expire_time": expireTimeStr,
}
}
func (v *vaultClient) stats() *VaultStats {
// Allocate a new stats struct
stats := new(VaultStats)
v.revLock.Lock()
stats.TrackedForRevoke = len(v.revoking)
v.revLock.Unlock()
v.currentExpirationLock.Lock()
stats.TokenExpiry = v.currentExpiration
v.currentExpirationLock.Unlock()
if !stats.TokenExpiry.IsZero() {
stats.TokenTTL = time.Until(stats.TokenExpiry)
}
return stats
}
// EmitStats is used to export metrics about the blocked eval tracker while enabled
func (v *vaultClient) EmitStats(period time.Duration, stopCh <-chan struct{}) {
timer, stop := helper.NewSafeTimer(period)
defer stop()
for {
timer.Reset(period)
select {
case <-timer.C:
stats := v.stats()
metrics.SetGauge([]string{"nomad", "vault", "distributed_tokens_revoking"}, float32(stats.TrackedForRevoke))
metrics.SetGauge([]string{"nomad", "vault", "token_ttl"}, float32(stats.TokenTTL/time.Millisecond))
case <-stopCh:
return
}
}
}
// extendExpiration sets the current auth token expiration record to ttLSeconds seconds from now
func (v *vaultClient) extendExpiration(ttlSeconds int) {
v.currentExpirationLock.Lock()
v.currentExpiration = time.Now().Add(time.Duration(ttlSeconds) * time.Second)
v.currentExpirationLock.Unlock()
}
// VaultVaultNoopDelegate returns the default vault api auth token handler
type VaultNoopDelegate struct{}
func (e *VaultNoopDelegate) clientForTask(v *vaultClient, namespace string) (*vapi.Client, error) {
return v.client, nil
}