057f117592
Re-orient the management of the tr.kill to happen in the parent of the spawned goroutine that is doing the actual token derivation. This makes the code a little more straightforward, making it easier to reason about not leaking the worker goroutine.
276 lines
7.8 KiB
Go
276 lines
7.8 KiB
Go
package taskrunner
|
|
|
|
import (
|
|
"context"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/hashicorp/go-hclog"
|
|
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
|
ti "github.com/hashicorp/nomad/client/allocrunner/taskrunner/interfaces"
|
|
"github.com/hashicorp/nomad/client/consul"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/pkg/errors"
|
|
)
|
|
|
|
const (
	// sidsHookName is the name reported by the hook's Name method and the
	// name given to the hook's logger.
	sidsHookName = "consul_si_token"

	// sidsBackoffBaseline is the unit of delay between attempts to retrieve
	// a Consul SI token; computeBackoff multiplies it by the attempt number.
	sidsBackoffBaseline = 5 * time.Second

	// sidsBackoffLimit is the longest delay computeBackoff will ever return
	// between attempts to retrieve a Consul SI token.
	sidsBackoffLimit = 3 * time.Minute

	// sidsDerivationTimeout is the default upper bound on the total time
	// spent trying to derive a SI token. If no token is obtained within this
	// window, the derivation fails.
	sidsDerivationTimeout = 5 * time.Minute

	// sidsTokenFile is the file name, inside the task's secrets directory,
	// under which the Consul SI token is stored.
	sidsTokenFile = "si_token"

	// sidsTokenFilePerms is the permission mode used when creating the token
	// file in the task's secrets directory.
	sidsTokenFilePerms = 0440
)
|
|
|
|
type sidsHookConfig struct {
|
|
alloc *structs.Allocation
|
|
task *structs.Task
|
|
sidsClient consul.ServiceIdentityAPI
|
|
lifecycle ti.TaskLifecycle
|
|
logger hclog.Logger
|
|
}
|
|
|
|
// Service Identities hook for managing SI tokens of connect enabled tasks.
|
|
type sidsHook struct {
|
|
// alloc is the allocation
|
|
alloc *structs.Allocation
|
|
|
|
// taskName is the name of the task
|
|
task *structs.Task
|
|
|
|
// sidsClient is the Consul client [proxy] for requesting SI tokens
|
|
sidsClient consul.ServiceIdentityAPI
|
|
|
|
// lifecycle is used to signal, restart, and kill a task
|
|
lifecycle ti.TaskLifecycle
|
|
|
|
// derivationTimeout is the amount of time we may wait for Consul to successfully
|
|
// provide a SI token. Making this configurable for testing, otherwise
|
|
// default to sidsDerivationTimeout
|
|
derivationTimeout time.Duration
|
|
|
|
// logger is used to log
|
|
logger hclog.Logger
|
|
|
|
// lock variables that can be manipulated after hook creation
|
|
lock sync.Mutex
|
|
// firstRun keeps track of whether the hook is being called for the first
|
|
// time (for this task) during the lifespan of the Nomad Client process.
|
|
firstRun bool
|
|
}
|
|
|
|
func newSIDSHook(c sidsHookConfig) *sidsHook {
|
|
return &sidsHook{
|
|
alloc: c.alloc,
|
|
task: c.task,
|
|
sidsClient: c.sidsClient,
|
|
lifecycle: c.lifecycle,
|
|
derivationTimeout: sidsDerivationTimeout,
|
|
logger: c.logger.Named(sidsHookName),
|
|
firstRun: true,
|
|
}
|
|
}
|
|
|
|
func (h *sidsHook) Name() string {
|
|
return sidsHookName
|
|
}
|
|
|
|
func (h *sidsHook) Prestart(
|
|
ctx context.Context,
|
|
req *interfaces.TaskPrestartRequest,
|
|
resp *interfaces.TaskPrestartResponse) error {
|
|
|
|
h.lock.Lock()
|
|
defer h.lock.Unlock()
|
|
|
|
// do nothing if we have already done things
|
|
if h.earlyExit() {
|
|
resp.Done = true
|
|
return nil
|
|
}
|
|
|
|
// optimistically try to recover token from disk
|
|
token, err := h.recoverToken(req.TaskDir.SecretsDir)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// need to ask for a new SI token & persist it to disk
|
|
if token == "" {
|
|
if token, err = h.deriveSIToken(ctx); err != nil {
|
|
return err
|
|
}
|
|
if err := h.writeToken(req.TaskDir.SecretsDir, token); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
h.logger.Info("derived SI token", "task", h.task.Name, "si_task", h.task.Kind.Value())
|
|
|
|
resp.Done = true
|
|
return nil
|
|
}
|
|
|
|
// earlyExit returns true if the Prestart hook has already been executed during
|
|
// the instantiation of this task runner.
|
|
//
|
|
// assumes h is locked
|
|
func (h *sidsHook) earlyExit() bool {
|
|
if h.firstRun {
|
|
h.firstRun = false
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
// writeToken writes token into the secrets directory for the task.
|
|
func (h *sidsHook) writeToken(dir string, token string) error {
|
|
tokenPath := filepath.Join(dir, sidsTokenFile)
|
|
if err := ioutil.WriteFile(tokenPath, []byte(token), sidsTokenFilePerms); err != nil {
|
|
return errors.Wrap(err, "failed to write SI token")
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// recoverToken returns the token saved to disk in the secrets directory for the
|
|
// task if it exists, or the empty string if the file does not exist. an error
|
|
// is returned only for some other (e.g. disk IO) error.
|
|
func (h *sidsHook) recoverToken(dir string) (string, error) {
|
|
tokenPath := filepath.Join(dir, sidsTokenFile)
|
|
token, err := ioutil.ReadFile(tokenPath)
|
|
if err != nil {
|
|
if !os.IsNotExist(err) {
|
|
h.logger.Error("failed to recover SI token", "error", err)
|
|
return "", errors.Wrap(err, "failed to recover SI token")
|
|
}
|
|
h.logger.Trace("no pre-existing SI token to recover", "task", h.task.Name)
|
|
return "", nil // token file does not exist yet
|
|
}
|
|
h.logger.Trace("recovered pre-existing SI token", "task", h.task.Name)
|
|
return string(token), nil
|
|
}
|
|
|
|
// siDerivationResult carries the outcome of an SI token derivation from the
// goroutine doing the derivation back to its caller.
type siDerivationResult struct {
	// token is the derived SI token; empty when err is set
	token string

	// err is the reason derivation failed, or nil on success
	err error
}
|
|
|
|
// deriveSIToken spawns and waits on a goroutine which will make attempts to
|
|
// derive an SI token until a token is successfully created, or ctx is signaled
|
|
// done.
|
|
func (h *sidsHook) deriveSIToken(ctx context.Context) (string, error) {
|
|
ctx2, cancel := context.WithTimeout(ctx, h.derivationTimeout)
|
|
defer cancel()
|
|
|
|
resultCh := make(chan siDerivationResult)
|
|
|
|
// keep trying to get the token in the background
|
|
go h.tryDerive(ctx2, resultCh)
|
|
|
|
// wait until we get a token, or we get a signal to quit
|
|
for {
|
|
select {
|
|
case result := <-resultCh:
|
|
if result.err != nil {
|
|
h.kill(ctx, errors.Wrap(result.err, "consul: failed to derive SI token"))
|
|
return "", result.err
|
|
}
|
|
return result.token, nil
|
|
case <-ctx2.Done():
|
|
return "", ctx2.Err()
|
|
}
|
|
}
|
|
}
|
|
|
|
func (h *sidsHook) kill(ctx context.Context, err error) {
|
|
_ = h.lifecycle.Kill(
|
|
ctx,
|
|
structs.NewTaskEvent(structs.TaskKilling).
|
|
SetFailsTask().
|
|
SetDisplayMessage(err.Error()),
|
|
)
|
|
}
|
|
|
|
// tryDerive loops forever until a token is created, or ctx is done.
|
|
func (h *sidsHook) tryDerive(ctx context.Context, ch chan<- siDerivationResult) {
|
|
for attempt := 0; backoff(ctx, attempt); attempt++ {
|
|
|
|
tokens, err := h.sidsClient.DeriveSITokens(h.alloc, []string{h.task.Name})
|
|
|
|
switch {
|
|
case err == nil:
|
|
token, exists := tokens[h.task.Name]
|
|
if !exists {
|
|
err := errors.New("response does not include token for task")
|
|
h.logger.Error("derive SI token is missing token for task", "error", err, "task", h.task.Name)
|
|
ch <- siDerivationResult{token: "", err: err}
|
|
return
|
|
}
|
|
ch <- siDerivationResult{token: token, err: nil}
|
|
return
|
|
case structs.IsServerSide(err):
|
|
// the error is known to be a server problem, just die
|
|
h.logger.Error("failed to derive SI token", "error", err, "task", h.task.Name, "server_side", true)
|
|
ch <- siDerivationResult{token: "", err: err}
|
|
return
|
|
case !structs.IsRecoverable(err):
|
|
// the error is known not to be recoverable, just die
|
|
h.logger.Error("failed to derive SI token", "error", err, "task", h.task.Name, "recoverable", false)
|
|
ch <- siDerivationResult{token: "", err: err}
|
|
return
|
|
|
|
default:
|
|
// the error is marked recoverable, retry after some backoff
|
|
h.logger.Error("failed attempt to derive SI token", "error", err, "recoverable", true)
|
|
}
|
|
}
|
|
}
|
|
|
|
func backoff(ctx context.Context, attempt int) bool {
|
|
next := computeBackoff(attempt)
|
|
select {
|
|
case <-ctx.Done():
|
|
return false
|
|
case <-time.After(next):
|
|
return true
|
|
}
|
|
}
|
|
|
|
func computeBackoff(attempt int) time.Duration {
|
|
switch attempt {
|
|
case 0:
|
|
return 0
|
|
case 1:
|
|
// go fast on first retry, because a unit test should be fast
|
|
return 100 * time.Millisecond
|
|
default:
|
|
wait := time.Duration(attempt) * sidsBackoffBaseline
|
|
if wait > sidsBackoffLimit {
|
|
wait = sidsBackoffLimit
|
|
}
|
|
return wait
|
|
}
|
|
}
|