open-nomad/client/allocrunner/taskrunner/vault_hook.go

package taskrunner

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/hashicorp/consul-template/signals"
	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	ti "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// vaultBackoffBaseline is the baseline time for exponential backoff when
	// attempting to retrieve a Vault token
	vaultBackoffBaseline = 5 * time.Second

	// vaultBackoffLimit is the limit of the exponential backoff when attempting
	// to retrieve a Vault token
	vaultBackoffLimit = 3 * time.Minute

	// vaultTokenFile is the name of the file holding the Vault token inside the
	// task's secret directory
	vaultTokenFile = "vault_token"
)

type vaultTokenUpdateHandler interface {
	updatedVaultToken(token string)
}

func (tr *TaskRunner) updatedVaultToken(token string) {
	// Update the task runner and environment
	tr.setVaultToken(token)

	// Trigger update hooks with the new Vault token
	tr.triggerUpdateHooks()
}

type vaultHookConfig struct {
	vaultBlock *structs.Vault
	client     vaultclient.VaultClient
	events     ti.EventEmitter
	lifecycle  ti.TaskLifecycle
	updater    vaultTokenUpdateHandler
	logger     log.Logger
	alloc      *structs.Allocation
	task       string
}

type vaultHook struct {
	// vaultBlock is the vault block for the task
	vaultBlock *structs.Vault

	// eventEmitter is used to emit events to the task
	eventEmitter ti.EventEmitter

	// lifecycle is used to signal, restart and kill a task
	lifecycle ti.TaskLifecycle

	// updater is used to update the Vault token
	updater vaultTokenUpdateHandler

	// client is the Vault client to retrieve and renew the Vault token
	client vaultclient.VaultClient

	// logger is used to log
	logger log.Logger

	// ctx and cancel are used to kill the long running token manager
	ctx    context.Context
	cancel context.CancelFunc

	// tokenPath is the path in which to read and write the token
	tokenPath string

	// alloc is the allocation
	alloc *structs.Allocation

	// taskName is the name of the task
	taskName string

	// firstRun stores whether it is the first run for the hook
	firstRun bool

	// future is used to wait on retrieving a Vault token
	future *tokenFuture
}

func newVaultHook(config *vaultHookConfig) *vaultHook {
	ctx, cancel := context.WithCancel(context.Background())
	h := &vaultHook{
		vaultBlock:   config.vaultBlock,
		client:       config.client,
		eventEmitter: config.events,
		lifecycle:    config.lifecycle,
		updater:      config.updater,
		alloc:        config.alloc,
		taskName:     config.task,
		firstRun:     true,
		ctx:          ctx,
		cancel:       cancel,
		future:       newTokenFuture(),
	}
	h.logger = config.logger.Named(h.Name())
	return h
}

func (*vaultHook) Name() string {
	return "vault"
}

func (h *vaultHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {
	// If we have already run prestart before, exit early. We do not use the
	// PrestartDone value because we want to recover the token on restoration.
	first := h.firstRun
	h.firstRun = false
	if !first {
		return nil
	}

	// Try to recover a token if it was previously written in the secrets
	// directory
	recoveredToken := ""
	h.tokenPath = filepath.Join(req.TaskDir.SecretsDir, vaultTokenFile)
	data, err := os.ReadFile(h.tokenPath)
	if err != nil {
		if !os.IsNotExist(err) {
			return fmt.Errorf("failed to recover vault token: %v", err)
		}

		// Token file doesn't exist
	} else {
		// Store the recovered token
		recoveredToken = string(data)
	}

	// Launch the token manager
	go h.run(recoveredToken)

	// Block until we get a token
	select {
	case <-h.future.Wait():
	case <-ctx.Done():
		return nil
	}

	h.updater.updatedVaultToken(h.future.Get())
	return nil
}

func (h *vaultHook) Stop(ctx context.Context, req *interfaces.TaskStopRequest, resp *interfaces.TaskStopResponse) error {
	// Shutdown any created manager
	h.cancel()
	return nil
}

func (h *vaultHook) Shutdown() {
	h.cancel()
}

// run should be called in a goroutine and manages the derivation, renewal and
// handling of errors with the Vault token. The token parameter allows seeding
// the manager with an initial Vault token, which is useful when the token is
// recovered off disk.
func (h *vaultHook) run(token string) {
	// Helper for stopping token renewal
	stopRenewal := func() {
		if err := h.client.StopRenewToken(h.future.Get()); err != nil {
			h.logger.Warn("failed to stop token renewal", "error", err)
		}
	}

	// updatedToken lets us store state between loops. If true, a new token
	// has been retrieved and we need to apply the Vault change mode
	var updatedToken bool

OUTER:
	for {
		// Check if we should exit
		if h.ctx.Err() != nil {
			stopRenewal()
			return
		}

		// Clear the token
		h.future.Clear()

		// Check if there already is a token which can be the case for
		// restoring the TaskRunner
		if token == "" {
			// Get a token
			var exit bool
			token, exit = h.deriveVaultToken()
			if exit {
				// Exit the manager
				return
			}

			// Write the token to disk
			if err := h.writeToken(token); err != nil {
				errorString := "failed to write Vault token to disk"
				h.logger.Error(errorString, "error", err)
				h.lifecycle.Kill(h.ctx,
					structs.NewTaskEvent(structs.TaskKilling).
						SetFailsTask().
						SetDisplayMessage(fmt.Sprintf("Vault %v", errorString)))
				return
			}
		}

		// Start the renewal process.
		//
		// This is the initial renew of the token which we derived from the
		// server. The client does not know how long it took for the token to
		// be generated and derived and also wants to gain control of the
		// process quickly, but not too quickly. We therefore use a hardcoded
		// increment value of 30; this value without a suffix is in seconds.
		//
		// If Vault is having availability issues or is overloaded, a large
		// number of initial token renews can exacerbate the problem.
		renewCh, err := h.client.RenewToken(token, 30)

		// An error returned means the token is not being renewed
		if err != nil {
			h.logger.Error("failed to start renewal of Vault token", "error", err)
			token = ""
			goto OUTER
		}

		// The Vault token is valid now, so set it
		h.future.Set(token)
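
		// Clarifying note (added): the block below applies the task's Vault
		// change_mode ("signal", "restart", or "noop") only when a fresh
		// token was derived on this pass. In every case, including "noop",
		// the updater is then notified after the switch, so consumers such
		// as consul-template pick up the new token rather than continuing to
		// renew a stale one.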
		if updatedToken {
			switch h.vaultBlock.ChangeMode {
			case structs.VaultChangeModeSignal:
				s, err := signals.Parse(h.vaultBlock.ChangeSignal)
				if err != nil {
					h.logger.Error("failed to parse signal", "error", err)
					h.lifecycle.Kill(h.ctx,
						structs.NewTaskEvent(structs.TaskKilling).
							SetFailsTask().
							SetDisplayMessage(fmt.Sprintf("Vault: failed to parse signal: %v", err)))
					return
				}

				event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetDisplayMessage("Vault: new Vault token acquired")
				if err := h.lifecycle.Signal(event, h.vaultBlock.ChangeSignal); err != nil {
					h.logger.Error("failed to send signal", "error", err)
					h.lifecycle.Kill(h.ctx,
						structs.NewTaskEvent(structs.TaskKilling).
							SetFailsTask().
							SetDisplayMessage(fmt.Sprintf("Vault: failed to send signal: %v", err)))
					return
				}
			case structs.VaultChangeModeRestart:
				const noFailure = false
				h.lifecycle.Restart(h.ctx,
					structs.NewTaskEvent(structs.TaskRestartSignal).
						SetDisplayMessage("Vault: new Vault token acquired"), noFailure)
			case structs.VaultChangeModeNoop:
				fallthrough
			default:
				h.logger.Error("invalid Vault change mode", "mode", h.vaultBlock.ChangeMode)
			}

			// We have handled it
			updatedToken = false

			// Call the handler
			h.updater.updatedVaultToken(token)
		}

		// Start watching for renewal errors
		select {
		case err := <-renewCh:
			// Clear the token
			token = ""
			h.logger.Error("failed to renew Vault token", "error", err)
			stopRenewal()
			updatedToken = true

		case <-h.ctx.Done():
			stopRenewal()
			return
		}
	}
}

// deriveVaultToken derives the Vault token using exponential backoffs. It
// returns the Vault token and whether the manager should exit.
func (h *vaultHook) deriveVaultToken() (token string, exit bool) {
	attempts := 0
	for {
		tokens, err := h.client.DeriveToken(h.alloc, []string{h.taskName})
		if err == nil {
			return tokens[h.taskName], false
		}

		// Check if this is a server side error
		if structs.IsServerSide(err) {
			h.logger.Error("failed to derive Vault token", "error", err, "server_side", true)
			h.lifecycle.Kill(h.ctx,
				structs.NewTaskEvent(structs.TaskKilling).
					SetFailsTask().
					SetDisplayMessage(fmt.Sprintf("Vault: server failed to derive vault token: %v", err)))
			return "", true
		}

		// Check if we can't recover from the error
		if !structs.IsRecoverable(err) {
			h.logger.Error("failed to derive Vault token", "error", err, "recoverable", false)
			h.lifecycle.Kill(h.ctx,
				structs.NewTaskEvent(structs.TaskKilling).
					SetFailsTask().
					SetDisplayMessage(fmt.Sprintf("Vault: failed to derive vault token: %v", err)))
			return "", true
		}

		// Handle the retry case
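		//
		// For illustration (added; not an original comment): with the 5s
		// baseline and the 3m limit defined above, the retry schedule is:
		//
		//	attempts = 0 -> (1 << 0) * 5s = 5s
		//	attempts = 1 -> (1 << 2) * 5s = 20s
		//	attempts = 2 -> (1 << 4) * 5s = 80s
		//	attempts = 3 -> (1 << 6) * 5s = 320s, capped at 3m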
		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
		if backoff > vaultBackoffLimit {
			backoff = vaultBackoffLimit
		}
		h.logger.Error("failed to derive Vault token", "error", err, "recoverable", true, "backoff", backoff)

		attempts++

		// Wait until retrying
		select {
		case <-h.ctx.Done():
			return "", true
		case <-time.After(backoff):
		}
	}
}

// writeToken writes the given token to disk
func (h *vaultHook) writeToken(token string) error {
	if err := os.WriteFile(h.tokenPath, []byte(token), 0666); err != nil {
		return fmt.Errorf("failed to write vault token: %v", err)
	}

	return nil
}

// tokenFuture stores the Vault token and allows consumers to block until a
// valid token exists
type tokenFuture struct {
	waiting []chan struct{}
	token   string
	set     bool
	m       sync.Mutex
}

// newTokenFuture returns a new token future without any token set
func newTokenFuture() *tokenFuture {
	return &tokenFuture{}
}

// Wait returns a channel that can be waited on. When this channel unblocks, a
// valid token will be available via the Get method
func (f *tokenFuture) Wait() <-chan struct{} {
	f.m.Lock()
	defer f.m.Unlock()

	c := make(chan struct{})
	if f.set {
		close(c)
		return c
	}

	f.waiting = append(f.waiting, c)
	return c
}

// Set sets the token value and unblocks any caller of Wait
func (f *tokenFuture) Set(token string) *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.set = true
	f.token = token
	for _, w := range f.waiting {
		close(w)
	}
	f.waiting = nil
	return f
}

// Clear clears the set vault token.
func (f *tokenFuture) Clear() *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.token = ""
	f.set = false
	return f
}

// Get returns the set Vault token
func (f *tokenFuture) Get() string {
	f.m.Lock()
	defer f.m.Unlock()

	return f.token
}