client: support graceful shutdowns

Client.Shutdown now blocks until all AllocRunners and TaskRunners have
exited their Run loops. Tasks are left running.
Michael Schurter 2018-11-14 10:29:07 -08:00
parent 4d92603340
commit 5bd744ac3d
15 changed files with 323 additions and 113 deletions
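At a high level, the change chains graceful shutdown from the Client through each AllocRunner to its TaskRunners, and only then stops the client's background goroutines; task processes themselves are never signalled. A rough sketch of that ordering, condensed from the diffs below (locking, hooks, DevMode, and error handling elided; not the literal implementation):

// Condensed sketch only; see the diffs below for the real code.
func (c *Client) Shutdown() error {
	// 1. Gracefully shut down every AllocRunner in parallel and wait.
	var wg sync.WaitGroup
	for _, ar := range c.getAllocRunners() {
		wg.Add(1)
		go func(ar AllocRunner) {
			defer wg.Done()
			ar.Shutdown() // cancels TaskRunners and waits for their Run loops
		}(ar)
	}
	wg.Wait()

	// 2. Signal and wait for the client's own background goroutines.
	close(c.shutdownCh)
	c.connPool.Shutdown()
	c.shutdownGroup.Wait()

	// 3. Persist state one last time and close the state database.
	c.saveState()
	return c.stateDB.Close()
}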

View File

@@ -37,7 +37,7 @@ type allocRunner struct {
 	// stateUpdater is used to emit updated alloc state
 	stateUpdater cinterfaces.AllocStateHandler

-	// taskStateUpdateCh is ticked whenever task state as changed. Must
+	// taskStateUpdatedCh is ticked whenever task state as changed. Must
 	// have len==1 to allow nonblocking notification of state updates while
 	// the goroutine is already processing a previous update.
 	taskStateUpdatedCh chan struct{}
@@ -62,12 +62,12 @@ type allocRunner struct {
 	// to access.
 	destroyed bool

-	// runLaunched is true if Run() has been called. If this is false
-	// Destroy() does not wait on tasks to shutdown as they are not
-	// running. Must acquire destroyedLock to access.
-	runLaunched bool
+	// runnersLaunched is true if TaskRunners were Run. Must acquire
+	// destroyedLock to access.
+	runnersLaunched bool

-	// destroyedLock guards destroyed, ran, and serializes Destroy() calls.
+	// destroyedLock guards destroyed, runnersLaunched, and serializes
+	// Shutdown/Destroy calls.
 	destroyedLock sync.Mutex

 	// Alloc captures the allocation being run.
@@ -178,38 +178,50 @@ func (ar *allocRunner) WaitCh() <-chan struct{} {

 // Run is the main goroutine that executes all the tasks.
 func (ar *allocRunner) Run() {
-	ar.destroyedLock.Lock()
-	defer ar.destroyedLock.Unlock()
-
-	// Run should not be called after Destroy is called. This is a
-	// programming error.
-	if ar.destroyed {
-		ar.logger.Error("alloc destroyed; cannot run")
-		return
-	}
+	// Close the wait channel on return
+	defer close(ar.waitCh)
+
+	// Start the task state update handler
+	go ar.handleTaskStateUpdates()

 	// If an alloc should not be run, ensure any restored task handles are
 	// destroyed and exit to wait for the AR to be GC'd by the client.
 	if !ar.shouldRun() {
 		ar.logger.Debug("not running terminal alloc")
-
-		// Cleanup and sync state
-		states := ar.killTasks()
-
-		// Get the client allocation
-		calloc := ar.clientAlloc(states)
-
-		// Update the server
-		ar.stateUpdater.AllocStateUpdated(calloc)
-
-		// Broadcast client alloc to listeners
-		ar.allocBroadcaster.Send(calloc)
-
+		// Ensure all tasks are cleaned up
+		ar.killTasks()
 		return
 	}

-	// Run! (and mark as having been run to ensure Destroy cleans up properly)
-	ar.runLaunched = true
-	go ar.runImpl()
+	// Mark task runners as being run for Shutdown
+	ar.destroyedLock.Lock()
+	ar.runnersLaunched = true
+	ar.destroyedLock.Unlock()
+
+	// If task update chan has been closed, that means we've been shutdown.
+	select {
+	case <-ar.taskStateUpdateHandlerCh:
+		return
+	default:
+	}
+
+	// Run the prestart hooks
+	if err := ar.prerun(); err != nil {
+		ar.logger.Error("prerun failed", "error", err)
+		goto POST
+	}
+
+	// Run the runners and block until they exit
+	<-ar.runTasks()
+
+POST:
+	// Run the postrun hooks
+	// XXX Equivalent to TR.Poststop hook
+	if err := ar.postrun(); err != nil {
+		ar.logger.Error("postrun failed", "error", err)
+	}
 }

 // shouldRun returns true if the alloc is in a state that the alloc runner
@@ -236,30 +248,6 @@ func (ar *allocRunner) shouldRun() bool {
 	return true
 }

-func (ar *allocRunner) runImpl() {
-	// Close the wait channel on return
-	defer close(ar.waitCh)
-
-	// Start the task state update handler
-	go ar.handleTaskStateUpdates()
-
-	// Run the prestart hooks
-	if err := ar.prerun(); err != nil {
-		ar.logger.Error("prerun failed", "error", err)
-		goto POST
-	}
-
-	// Run the runners and block until they exit
-	<-ar.runTasks()
-
-POST:
-	// Run the postrun hooks
-	// XXX Equivalent to TR.Poststop hook
-	if err := ar.postrun(); err != nil {
-		ar.logger.Error("postrun failed", "error", err)
-	}
-}
-
 // runTasks is used to run the task runners.
 func (ar *allocRunner) runTasks() <-chan struct{} {
 	for _, task := range ar.tasks {
@@ -328,7 +316,7 @@ func (ar *allocRunner) TaskStateUpdated() {
 }

 // handleTaskStateUpdates must be run in goroutine as it monitors
-// taskStateUpdateCh for task state update notifications and processes task
+// taskStateUpdatedCh for task state update notifications and processes task
 // states.
 //
 // Processing task state updates must be done in a goroutine as it may have to
@@ -340,11 +328,13 @@ func (ar *allocRunner) handleTaskStateUpdates() {
 		select {
 		case <-ar.taskStateUpdatedCh:
 		case <-ar.waitCh:
-			// Tasks have exited, run once more to ensure final
+			// Run has exited, sync once more to ensure final
 			// states are collected.
 			done = true
 		}

+		ar.logger.Trace("handling task state update", "done", done)
+
 		// Set with the appropriate event if task runners should be
 		// killed.
 		var killEvent *structs.TaskEvent
@@ -620,12 +610,11 @@ func (ar *allocRunner) Listener() *cstructs.AllocListener {
 // exit (thus closing WaitCh).
 func (ar *allocRunner) Destroy() {
 	ar.destroyedLock.Lock()
-	defer ar.destroyedLock.Unlock()
-
 	if ar.destroyed {
 		// Only destroy once
+		ar.destroyedLock.Unlock()
 		return
 	}
+	defer ar.destroyedLock.Unlock()

 	// Stop any running tasks and persist states in case the client is
 	// shutdown before Destroy finishes.
@@ -633,10 +622,8 @@ func (ar *allocRunner) Destroy() {
 	calloc := ar.clientAlloc(states)
 	ar.stateUpdater.AllocStateUpdated(calloc)

-	// Wait for tasks to exit and postrun hooks to finish (if they ran at all)
-	if ar.runLaunched {
-		<-ar.waitCh
-	}
+	// Wait for tasks to exit and postrun hooks to finish
+	<-ar.waitCh

 	// Run destroy hooks
 	if err := ar.destroy(); err != nil {
@@ -645,9 +632,7 @@ func (ar *allocRunner) Destroy() {

 	// Wait for task state update handler to exit before removing local
 	// state if Run() ran at all.
-	if ar.runLaunched {
-		<-ar.taskStateUpdateHandlerCh
-	}
+	<-ar.taskStateUpdateHandlerCh

 	// Cleanup state db
 	if err := ar.stateDB.DeleteAllocationBucket(ar.id); err != nil {
@@ -678,6 +663,43 @@ func (ar *allocRunner) IsWaiting() bool {
 	return ar.prevAllocWatcher.IsWaiting()
 }

+// Shutdown AllocRunner gracefully. Blocks while shutting down all TaskRunners.
+// Tasks are unaffected and may be restored.
+func (ar *allocRunner) Shutdown() {
+	ar.destroyedLock.Lock()
+	defer ar.destroyedLock.Unlock()
+
+	// Destroy is a superset of Shutdown so there's nothing to do if this
+	// has already been destroyed.
+	if ar.destroyed {
+		return
+	}
+
+	ar.logger.Trace("shutting down")
+
+	// Shutdown tasks gracefully if they were run
+	if ar.runnersLaunched {
+		wg := sync.WaitGroup{}
+		for _, tr := range ar.tasks {
+			wg.Add(1)
+			go func(tr *taskrunner.TaskRunner) {
+				tr.Shutdown()
+				wg.Done()
+			}(tr)
+		}
+		wg.Wait()
+	}
+
+	// Wait for Run to exit
+	<-ar.waitCh
+
+	// Run shutdown hooks
+	ar.shutdownHooks()
+
+	// Wait for updater to finish its final run
+	<-ar.taskStateUpdateHandlerCh
+}
+
 // IsMigrating returns true if the alloc runner is migrating data from its
 // previous allocation.
 //

View File

@@ -241,3 +241,27 @@ func (ar *allocRunner) destroy() error {

 	return nil
 }
+
+// shutdownHooks calls graceful shutdown hooks for when the agent is exiting.
+func (ar *allocRunner) shutdownHooks() {
+	for _, hook := range ar.runnerHooks {
+		sh, ok := hook.(interfaces.ShutdownHook)
+		if !ok {
+			continue
+		}
+
+		name := sh.Name()
+		var start time.Time
+		if ar.logger.IsTrace() {
+			start = time.Now()
+			ar.logger.Trace("running shutdown hook", "name", name, "start", start)
+		}
+
+		sh.Shutdown()
+
+		if ar.logger.IsTrace() {
+			end := time.Now()
+			ar.logger.Trace("finished shutdown hooks", "name", name, "end", end, "duration", end.Sub(start))
+		}
+	}
+}

View File

@@ -124,7 +124,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
 	ar, err := NewAllocRunner(conf)
 	require.NoError(t, err)
 	defer ar.Destroy()
-	ar.Run()
+	go ar.Run()

 	// Wait for all tasks to be killed
 	upd := conf.StateUpdater.(*MockStateUpdater)
@@ -214,7 +214,7 @@ func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
 	ar, err := NewAllocRunner(conf)
 	require.NoError(t, err)
 	defer ar.Destroy()
-	ar.Run()
+	go ar.Run()

 	// Wait for tasks to start
 	upd := conf.StateUpdater.(*MockStateUpdater)
@@ -308,7 +308,6 @@ func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
 	ar, err := NewAllocRunner(conf)
 	require.NoError(t, err)
-	defer ar.Destroy()

 	// Mimic Nomad exiting before the leader stopping is able to stop other tasks.
 	ar.tasks["leader"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))

View File

@@ -188,6 +188,11 @@ func (h *allocHealthWatcherHook) Destroy() error {
 	return nil
 }

+func (h *allocHealthWatcherHook) Shutdown() {
+	// Same as Destroy
+	h.Destroy()
+}
+
 // watchHealth watches alloc health until it is set, the alloc is stopped, or
 // the context is canceled. watchHealth will be canceled and restarted on
 // Updates so calls are serialized with a lock.

View File

@@ -23,6 +23,7 @@ import (
 var _ interfaces.RunnerPrerunHook = (*allocHealthWatcherHook)(nil)
 var _ interfaces.RunnerUpdateHook = (*allocHealthWatcherHook)(nil)
 var _ interfaces.RunnerDestroyHook = (*allocHealthWatcherHook)(nil)
+var _ interfaces.ShutdownHook = (*allocHealthWatcherHook)(nil)

 // allocHealth is emitted to a chan whenever SetHealth is called
 type allocHealth struct {

View File

@@ -42,3 +42,11 @@ type HookTarget interface {
 	// State retrieves a copy of the target alloc runners state.
 	State() *state.State
 }
+
+// ShutdownHook may be implemented by AllocRunner or TaskRunner hooks and will
+// be called when the agent process is being shutdown gracefully.
+type ShutdownHook interface {
+	RunnerHook
+
+	Shutdown()
+}
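For illustration only (this hook is hypothetical and not part of the commit): a runner hook that holds agent-local resources could satisfy ShutdownHook by releasing them on graceful exit, mirroring how statsHook and vaultHook implement it below.

// Hypothetical hook, shown only to illustrate the ShutdownHook contract.
package allocrunner

import (
	"io"

	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
)

type exampleShutdownHook struct {
	conn io.Closer // some agent-local resource
}

// Name satisfies RunnerHook.
func (*exampleShutdownHook) Name() string { return "example_shutdown" }

// Shutdown is called on graceful agent exit. The task keeps running, so
// only agent-local resources should be released here.
func (h *exampleShutdownHook) Shutdown() {
	if h.conn != nil {
		h.conn.Close()
	}
}

// Compile-time assertion, mirroring the ones added in the test files below.
var _ interfaces.ShutdownHook = (*exampleShutdownHook)(nil)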

View File

@@ -23,7 +23,7 @@ type statsHook struct {
 	updater  StatsUpdater
 	interval time.Duration

-	// stopCh is closed by Exited
+	// stopCh is closed by Exited or Canceled
 	stopCh chan struct{}

 	mu sync.Mutex
@@ -118,3 +118,19 @@ func (h *statsHook) collectResourceUsageStats(handle interfaces.DriverStats, sto
 		}
 	}
 }
+
+func (h *statsHook) Shutdown() {
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	if h.stopCh == nil {
+		return
+	}
+
+	select {
+	case <-h.stopCh:
+		// Already closed
+	default:
+		close(h.stopCh)
+	}
+}

View File

@@ -15,6 +15,7 @@ import (
 // Statically assert the stats hook implements the expected interfaces
 var _ interfaces.TaskPoststartHook = (*statsHook)(nil)
 var _ interfaces.TaskExitedHook = (*statsHook)(nil)
+var _ interfaces.ShutdownHook = (*statsHook)(nil)

 type mockStatsUpdater struct {
 	// Ch is sent task resource usage updates if not nil

View File

@@ -67,13 +67,16 @@ type TaskRunner struct {
 	stateUpdater interfaces.TaskStateHandler

 	// state captures the state of the task for updating the allocation
-	state     *structs.TaskState
-	stateLock sync.Mutex
+	// Must acquire stateLock to access.
+	state *structs.TaskState

 	// localState captures the node-local state of the task for when the
-	// Nomad agent restarts
-	localState     *state.LocalState
-	localStateLock sync.RWMutex
+	// Nomad agent restarts.
+	// Must acquire stateLock to access.
+	localState *state.LocalState
+
+	// stateLock must be acquired when accessing state or localState.
+	stateLock sync.RWMutex

 	// stateDB is for persisting localState and taskState
 	stateDB cstate.StateDB
@@ -498,7 +501,7 @@ func (tr *TaskRunner) runDriver() error {
 		return fmt.Errorf("driver start failed: %v", err)
 	}

-	tr.localStateLock.Lock()
+	tr.stateLock.Lock()
 	tr.localState.TaskHandle = handle
 	tr.localState.DriverNetwork = net
 	if err := tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState); err != nil {
@@ -508,7 +511,7 @@ func (tr *TaskRunner) runDriver() error {
 		tr.logger.Warn("error persisting local task state; may be unable to restore after a Nomad restart",
 			"error", err, "task_id", handle.Config.ID)
 	}
-	tr.localStateLock.Unlock()
+	tr.stateLock.Unlock()

 	tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))
@@ -612,8 +615,8 @@ func (tr *TaskRunner) killTask(handle *DriverHandle) error {

 // persistLocalState persists local state to disk synchronously.
 func (tr *TaskRunner) persistLocalState() error {
-	tr.localStateLock.Lock()
-	defer tr.localStateLock.Unlock()
+	tr.stateLock.RLock()
+	defer tr.stateLock.RUnlock()

 	return tr.stateDB.PutTaskRunnerLocalState(tr.allocID, tr.taskName, tr.localState)
 }
@@ -673,7 +676,12 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *cstruct
 	}

 	if err := tr.driver.RecoverTask(taskHandle); err != nil {
-		tr.logger.Error("error recovering task; destroying and restarting",
+		if tr.TaskState().State != structs.TaskStateRunning {
+			// RecoverTask should fail if the Task wasn't running
+			return
+		}
+
+		tr.logger.Error("error recovering task; cleaning up",
 			"error", err, "task_id", taskHandle.Config.ID)

 		// Try to cleanup any existing task state in the plugin before restarting
@@ -846,8 +854,21 @@ func (tr *TaskRunner) WaitCh() <-chan struct{} {
 // This method is safe for calling concurrently with Run() and does not modify
 // the passed in allocation.
 func (tr *TaskRunner) Update(update *structs.Allocation) {
+	task := update.LookupTask(tr.taskName)
+	if task == nil {
+		// This should not happen and likely indicates a bug in the
+		// server or client.
+		tr.logger.Error("allocation update is missing task; killing",
+			"group", update.TaskGroup)
+		te := structs.NewTaskEvent(structs.TaskKilled).
+			SetKillReason("update missing task").
+			SetFailsTask()
+		tr.Kill(context.Background(), te)
+		return
+	}
+
 	// Update tr.alloc
-	tr.setAlloc(update)
+	tr.setAlloc(update, task)

 	// Trigger update hooks if not terminal
 	if !update.TerminalStatus() {
@@ -868,6 +889,21 @@ func (tr *TaskRunner) triggerUpdateHooks() {
 	}
 }

+// Shutdown TaskRunner gracefully without affecting the state of the task.
+// Shutdown blocks until the main Run loop exits.
+func (tr *TaskRunner) Shutdown() {
+	tr.logger.Trace("shutting down")
+	tr.ctxCancel()
+
+	<-tr.WaitCh()
+
+	// Run shutdown hooks to cleanup
+	tr.shutdownHooks()
+
+	// Persist once more
+	tr.persistLocalState()
+}
+
 // LatestResourceUsage returns the last resource utilization datapoint
 // collected. May return nil if the task is not running or no resource
 // utilization has been collected yet.

View File

@@ -10,10 +10,16 @@ func (tr *TaskRunner) Alloc() *structs.Allocation {
 	return tr.alloc
 }

-func (tr *TaskRunner) setAlloc(updated *structs.Allocation) {
+// setAlloc and task on TaskRunner
+func (tr *TaskRunner) setAlloc(updated *structs.Allocation, task *structs.Task) {
 	tr.allocLock.Lock()
+	defer tr.allocLock.Unlock()
+
+	tr.taskLock.Lock()
+	defer tr.taskLock.Unlock()
+
 	tr.alloc = updated
-	tr.allocLock.Unlock()
+	tr.task = task
 }

 // IsLeader returns true if this task is the leader of its task group.

View File

@@ -101,11 +101,11 @@ func (tr *TaskRunner) prestart() error {
 		}

 		var origHookState *state.HookState
-		tr.localStateLock.RLock()
+		tr.stateLock.RLock()
 		if tr.localState.Hooks != nil {
 			origHookState = tr.localState.Hooks[name]
 		}
-		tr.localStateLock.RUnlock()
+		tr.stateLock.RUnlock()
 		if origHookState != nil && origHookState.PrestartDone {
 			tr.logger.Trace("skipping done prestart hook", "name", pre.Name())
 			continue
@@ -135,9 +135,9 @@ func (tr *TaskRunner) prestart() error {
 		// Store and persist local state if the hook state has changed
 		if !hookState.Equal(origHookState) {
-			tr.localStateLock.Lock()
+			tr.stateLock.Lock()
 			tr.localState.Hooks[name] = hookState
-			tr.localStateLock.Unlock()
+			tr.stateLock.Unlock()

 			if err := tr.persistLocalState(); err != nil {
 				return err
@@ -360,12 +360,12 @@ func (tr *TaskRunner) killing() {
 	}

 	for _, hook := range tr.runnerHooks {
-		upd, ok := hook.(interfaces.TaskKillHook)
+		killHook, ok := hook.(interfaces.TaskKillHook)
 		if !ok {
 			continue
 		}

-		name := upd.Name()
+		name := killHook.Name()

 		// Time the update hook
 		var start time.Time
@@ -374,10 +374,10 @@ func (tr *TaskRunner) killing() {
 			tr.logger.Trace("running kill hook", "name", name, "start", start)
 		}

-		// Run the update hook
+		// Run the kill hook
 		req := interfaces.TaskKillRequest{}
 		var resp interfaces.TaskKillResponse
-		if err := upd.Killing(context.Background(), &req, &resp); err != nil {
+		if err := killHook.Killing(context.Background(), &req, &resp); err != nil {
 			tr.logger.Error("kill hook failed", "name", name, "error", err)
 		}
@@ -389,3 +389,30 @@ func (tr *TaskRunner) killing() {
 		}
 	}
 }
+
+// shutdownHooks is called when the TaskRunner is gracefully shutdown but the
+// task is not being stopped or garbage collected.
+func (tr *TaskRunner) shutdownHooks() {
+	for _, hook := range tr.runnerHooks {
+		sh, ok := hook.(interfaces.ShutdownHook)
+		if !ok {
+			continue
+		}
+
+		name := sh.Name()
+
+		// Time the update hook
+		var start time.Time
+		if tr.logger.IsTrace() {
+			start = time.Now()
+			tr.logger.Trace("running shutdown hook", "name", name, "start", start)
+		}
+
+		sh.Shutdown()
+
+		if tr.logger.IsTrace() {
+			end := time.Now()
+			tr.logger.Trace("finished shutdown hook", "name", name, "end", end, "duration", end.Sub(start))
+		}
+	}
+}

View File

@@ -127,8 +127,7 @@ func TestTaskRunner_Restore_Running(t *testing.T) {
 	})

 	// Cause TR to exit without shutting down task
-	origTR.ctxCancel()
-	<-origTR.WaitCh()
+	origTR.Shutdown()

 	// Start a new TaskRunner and make sure it does not rerun the task
 	newTR, err := NewTaskRunner(conf)

View File

@@ -162,6 +162,10 @@ func (h *vaultHook) Stop(ctx context.Context, req *interfaces.TaskStopRequest, r
 	return nil
 }

+func (h *vaultHook) Shutdown() {
+	h.cancel()
+}
+
 // run should be called in a go-routine and manages the derivation, renewal and
 // handling of errors with the Vault token. The optional parameter allows
 // setting the initial Vault token. This is useful when the Vault token is

View File

@@ -0,0 +1,8 @@
+package taskrunner
+
+import "github.com/hashicorp/nomad/client/allocrunner/interfaces"
+
+// Statically assert the stats hook implements the expected interfaces
+var _ interfaces.TaskPrestartHook = (*vaultHook)(nil)
+var _ interfaces.TaskStopHook = (*vaultHook)(nil)
+var _ interfaces.ShutdownHook = (*vaultHook)(nil)

View File

@@ -106,6 +106,7 @@ type AllocRunner interface {
 	Alloc() *structs.Allocation
 	AllocState() *arstate.State
 	Destroy()
+	Shutdown()
 	GetAllocDir() *allocdir.AllocDir
 	IsDestroyed() bool
 	IsMigrating() bool
@@ -186,10 +187,19 @@ type Client struct {
 	// HostStatsCollector collects host resource usage stats
 	hostStatsCollector *stats.HostStatsCollector

-	shutdown     bool
-	shutdownCh   chan struct{}
+	// shutdown is true when the Client has been shutdown. Must hold
+	// shutdownLock to access.
+	shutdown bool
+
+	// shutdownCh is closed to signal the Client is shutting down.
+	shutdownCh chan struct{}
+
 	shutdownLock sync.Mutex

+	// shutdownGroup are goroutines that exit when shutdownCh is closed.
+	// Shutdown() blocks on Wait() after closing shutdownCh.
+	shutdownGroup group
+
 	// vaultClient is used to interact with Vault for token and secret renewals
 	vaultClient vaultclient.VaultClient
@@ -332,7 +342,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
 	// Setup Consul discovery if enabled
 	if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin {
-		go c.consulDiscovery()
+		c.shutdownGroup.Go(c.consulDiscovery)
 		if c.servers.NumServers() == 0 {
 			// No configured servers; trigger discovery manually
 			c.triggerDiscoveryCh <- struct{}{}
@@ -359,19 +369,21 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
 	}

 	// Register and then start heartbeating to the servers.
-	go c.registerAndHeartbeat()
+	c.shutdownGroup.Go(c.registerAndHeartbeat)

 	// Begin periodic snapshotting of state.
-	go c.periodicSnapshot()
+	c.shutdownGroup.Go(c.periodicSnapshot)

 	// Begin syncing allocations to the server
-	go c.allocSync()
+	c.shutdownGroup.Go(c.allocSync)

-	// Start the client!
+	// Start the client! Don't use the shutdownGroup as run handles
+	// shutdowns manually to prevent updates from being applied during
+	// shutdown.
 	go c.run()

 	// Start collecting stats
-	go c.emitStats()
+	c.shutdownGroup.Go(c.emitStats)

 	c.logger.Info("started client", "node_id", c.NodeID())
 	return c, nil
@@ -533,20 +545,14 @@ func (c *Client) RPCMinorVersion() int {

 // Shutdown is used to tear down the client
 func (c *Client) Shutdown() error {
-	c.logger.Info("shutting down")
 	c.shutdownLock.Lock()
 	defer c.shutdownLock.Unlock()

 	if c.shutdown {
-		c.logger.Info("already shutdown")
 		return nil
 	}
+	c.logger.Info("shutting down")

-	// Defer closing the database
-	defer func() {
-		if err := c.stateDB.Close(); err != nil {
-			c.logger.Error("error closing state database on shutdown", "error", err)
-		}
-	}()
-
 	// Shutdown the device manager
 	c.devicemanager.Shutdown()
@@ -559,20 +565,39 @@ func (c *Client) Shutdown() error {
 	// Stop Garbage collector
 	c.garbageCollector.Stop()

-	// Destroy all the running allocations.
 	if c.config.DevMode {
+		// In DevMode destroy all the running allocations.
 		for _, ar := range c.getAllocRunners() {
 			ar.Destroy()
 		}
 		for _, ar := range c.getAllocRunners() {
 			<-ar.WaitCh()
 		}
+	} else {
+		// In normal mode call shutdown
+		wg := sync.WaitGroup{}
+		for _, ar := range c.getAllocRunners() {
+			wg.Add(1)
+			go func(ar AllocRunner) {
+				ar.Shutdown()
+				wg.Done()
+			}(ar)
+		}
+		wg.Wait()
 	}

 	c.shutdown = true
 	close(c.shutdownCh)
+
+	// Must close connection pool to unblock alloc watcher
 	c.connPool.Shutdown()
-	return nil
+
+	// Wait for goroutines to stop
+	c.shutdownGroup.Wait()
+
+	// One final save state
+	c.saveState()
+
+	return c.stateDB.Close()
 }

 // Stats is used to return statistics for debugging and insight
@@ -829,7 +854,7 @@ func (c *Client) restoreState() error {
 	// All allocs restored successfully, run them!
 	c.allocLock.Lock()
 	for _, ar := range c.allocs {
-		ar.Run()
+		go ar.Run()
 	}
 	c.allocLock.Unlock()
@@ -1194,10 +1219,10 @@ func (c *Client) registerAndHeartbeat() {
 	c.retryRegisterNode()

 	// Start watching changes for node changes
-	go c.watchNodeUpdates()
+	c.shutdownGroup.Go(c.watchNodeUpdates)

 	// Start watching for emitting node events
-	go c.watchNodeEvents()
+	c.shutdownGroup.Go(c.watchNodeEvents)

 	// Setup the heartbeat timer, for the initial registration
 	// we want to do this quickly. We want to do it extra quickly
@@ -1311,7 +1336,7 @@ func (c *Client) periodicSnapshot() {
 	}
 }

-// run is a long lived goroutine used to run the client
+// run is a long lived goroutine used to run the client. Shutdown() stops it first
 func (c *Client) run() {
 	// Watch for changes in allocations
 	allocUpdates := make(chan *allocUpdates, 8)
@@ -1320,7 +1345,17 @@ func (c *Client) run() {
 	for {
 		select {
 		case update := <-allocUpdates:
+			// Don't apply updates while shutting down.
+			c.shutdownLock.Lock()
+			if c.shutdown {
+				c.shutdownLock.Unlock()
+				return
+			}
+
+			// Apply updates inside lock to prevent a concurrent
+			// shutdown.
 			c.runAllocs(update)
+			c.shutdownLock.Unlock()
 		case <-c.shutdownCh:
 			return
@@ -1785,6 +1820,7 @@ OUTER:
 			pulled:        pulledAllocs,
 			migrateTokens: resp.MigrateTokens,
 		}
+
 		select {
 		case updates <- update:
 		case <-c.shutdownCh:
@@ -1974,7 +2010,7 @@ func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error
 	// Store the alloc runner.
 	c.allocs[alloc.ID] = ar

-	ar.Run()
+	go ar.Run()

 	return nil
 }
@@ -2561,3 +2597,21 @@ func (c *Client) allAllocs() map[string]*structs.Allocation {
 	}
 	return allocs
 }
+
+// group wraps a func() in a goroutine and provides a way to block until it
+// exits. Inspired by https://godoc.org/golang.org/x/sync/errgroup
+type group struct {
+	wg sync.WaitGroup
+}
+
+func (g *group) Go(f func()) {
+	g.wg.Add(1)
+	go func() {
+		defer g.wg.Done()
+		f()
+	}()
+}
+
+func (g *group) Wait() {
+	g.wg.Wait()
+}
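A minimal usage sketch of the group helper (hypothetical caller, assuming the group type above is in scope; in this commit the client registers its background goroutines via c.shutdownGroup.Go and blocks on Wait during Shutdown):

// Hypothetical, self-contained sketch of how group is used.
package main

import "time"

func main() {
	shutdownCh := make(chan struct{})

	var g group
	// Register a long-lived goroutine that exits once shutdownCh closes.
	g.Go(func() {
		for {
			select {
			case <-shutdownCh:
				return
			case <-time.After(time.Second):
				// periodic work would go here
			}
		}
	})

	// Graceful shutdown: signal, then block until every registered
	// goroutine has returned.
	close(shutdownCh)
	g.Wait()
}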