d3c4700cd3
* jobspec, api: add stop_after_client_disconnect

* nomad/state/state_store: fix an error message typo

* structs: alloc methods to support stop_after_client_disconnect

  1. a global AllocStates to track status changes with timestamps. We need
     this to track the time at which the alloc originally became lost.
  2. ShouldClientStop() and WaitClientStop() to actually do the math

* scheduler/reconcile_util: delayByStopAfterClientDisconnect

* scheduler/reconcile: use delayByStopAfterClientDisconnect

* scheduler/util: updateNonTerminalAllocsToLost comments

  This was set up to update allocs to lost only if the DesiredStatus had
  already been set by the scheduler. It seems the intention was to update
  the status from any non-terminal state, and not all lost allocs have been
  marked stop or evict by this point.

* scheduler/testing: AssertEvalStatus just uses require

* scheduler/generic_sched: don't create a blocked eval if delayed

* scheduler/generic_sched_test: several scheduling cases
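For orientation (not part of the diff below): this file treats the new setting as a `*time.Duration` hung off the task group, per its dereference of `tg.StopAfterClientDisconnect`. A minimal, hypothetical sketch of setting the field through the structs API, assuming nothing beyond what this file shows:

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// Stop this group's allocs once their client has been disconnected
	// for five minutes. The field is a *time.Duration, matching the
	// dereference of tg.StopAfterClientDisconnect in heartbeatstop.go.
	d := 5 * time.Minute
	tg := &structs.TaskGroup{
		Name:                      "web",
		StopAfterClientDisconnect: &d,
	}
	fmt.Printf("group %q stops after %s disconnected\n", tg.Name, *tg.StopAfterClientDisconnect)
}
```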
159 lines · 3.8 KiB · Go
package client

import (
	"sync"
	"time"

	hclog "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/nomad/nomad/structs"
)
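// heartbeatStop tracks the last successful heartbeat to the servers and stops
// allocations whose task groups set stop_after_client_disconnect once the
// client has been disconnected for longer than that interval.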
type heartbeatStop struct {
	lastOk        time.Time
	startupGrace  time.Time
	allocInterval map[string]time.Duration
	allocHookCh   chan *structs.Allocation
	getRunner     func(string) (AllocRunner, error)
	logger        hclog.InterceptLogger
	shutdownCh    chan struct{}
	lock          *sync.RWMutex
}
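// newHeartbeatStop returns a heartbeatStop whose startup grace period ends
// timeout from now; before the first heartbeat, allocations are only stopped
// once that grace deadline has passed.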
func newHeartbeatStop(
	getRunner func(string) (AllocRunner, error),
	timeout time.Duration,
	logger hclog.InterceptLogger,
	shutdownCh chan struct{}) *heartbeatStop {

	h := &heartbeatStop{
		startupGrace:  time.Now().Add(timeout),
		allocInterval: make(map[string]time.Duration),
		allocHookCh:   make(chan *structs.Allocation),
		getRunner:     getRunner,
		logger:        logger,
		shutdownCh:    shutdownCh,
		lock:          &sync.RWMutex{},
	}

	return h
}
// allocHook is called after (re)storing a new AllocRunner in the client. It registers the
// allocation to be stopped if its task group sets stop_after_client_disconnect
func (h *heartbeatStop) allocHook(alloc *structs.Allocation) {
	tg := allocTaskGroup(alloc)
	if tg != nil && tg.StopAfterClientDisconnect != nil {
		h.allocHookCh <- alloc
	}
}
// shouldStop is called on a restored alloc to determine if lastOk is sufficiently in the
// past that it should be prevented from restarting
func (h *heartbeatStop) shouldStop(alloc *structs.Allocation) bool {
	tg := allocTaskGroup(alloc)
	if tg != nil && tg.StopAfterClientDisconnect != nil {
		return h.shouldStopAfter(time.Now(), *tg.StopAfterClientDisconnect)
	}
	return false
}
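// shouldStopAfter reports whether an alloc with the given
// stop_after_client_disconnect interval should be stopped at time now. Before
// the first successful heartbeat, lastOk is zero and the startup grace
// deadline applies instead.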
func (h *heartbeatStop) shouldStopAfter(now time.Time, interval time.Duration) bool {
	lastOk := h.getLastOk()
	if lastOk.IsZero() {
		// No successful heartbeat yet: stop only once the startup grace
		// deadline has passed, not while we are still inside the window.
		return now.After(h.startupGrace)
	}
	return now.After(lastOk.Add(interval))
}
// watch is a loop that checks for allocations that should be stopped. It also manages the
// registration of allocs to be stopped in a single thread.
func (h *heartbeatStop) watch() {
	// If we never manage to successfully contact the server, we still want to
	// stop our allocs after start time + their interval, so seed lastOk now.
	// setLastOk takes the lock, keeping this write safe against concurrent
	// readers on other goroutines.
	h.setLastOk(time.Now())
	stop := make(chan string, 1)
	var now time.Time
	var interval time.Duration
	checkAllocs := false
	for {
		// Wake up at the smallest registered stop interval, capped at 5s,
		// so no alloc overshoots its deadline by much
		interval = 5 * time.Second
		for _, t := range h.allocInterval {
			if t < interval {
				interval = t
			}
		}

		checkAllocs = false
		timeout := time.After(interval)
		select {
		case allocID := <-stop:
			if err := h.stopAlloc(allocID); err != nil {
				h.logger.Warn("error stopping on heartbeat timeout", "alloc", allocID, "error", err)
				continue
			}
			delete(h.allocInterval, allocID)

		case alloc := <-h.allocHookCh:
			tg := allocTaskGroup(alloc)
			if tg != nil && tg.StopAfterClientDisconnect != nil {
				h.allocInterval[alloc.ID] = *tg.StopAfterClientDisconnect
			}

		case <-timeout:
			checkAllocs = true

		case <-h.shutdownCh:
			return
		}

		if !checkAllocs {
			continue
		}
		now = time.Now()
		for allocID, d := range h.allocInterval {
			if h.shouldStopAfter(now, d) {
				// stop has capacity 1 and is drained by this same loop, so a
				// blocking send could deadlock when several allocs expire at
				// once. Skip instead: the alloc stays in allocInterval and is
				// retried on the next tick.
				select {
				case stop <- allocID:
				default:
				}
			}
		}
	}
}
// setLastOk sets the last known good heartbeat time to the given time
func (h *heartbeatStop) setLastOk(t time.Time) {
	h.lock.Lock()
	defer h.lock.Unlock()
	h.lastOk = t
}
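// getLastOk returns the last known good heartbeat time under the read lock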
func (h *heartbeatStop) getLastOk() time.Time {
	h.lock.RLock()
	defer h.lock.RUnlock()
	return h.lastOk
}
// stopAlloc stops the allocation by destroying its runner
func (h *heartbeatStop) stopAlloc(allocID string) error {
	runner, err := h.getRunner(allocID)
	if err != nil {
		return err
	}

	h.logger.Debug("stopping alloc for stop_after_client_disconnect", "alloc", allocID)

	runner.Destroy()
	return nil
}
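// allocTaskGroup returns the alloc's task group definition from its job, or
// nil if the job defines no group with that name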
func allocTaskGroup(alloc *structs.Allocation) *structs.TaskGroup {
	for _, tg := range alloc.Job.TaskGroups {
		if tg.Name == alloc.TaskGroup {
			return tg
		}
	}
	return nil
}
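// Hypothetical wiring from the client, shown for orientation only; the names
// below (c.getAllocRunner, heartbeatGrace, c.shutdownCh) are illustrative,
// not taken from the client package:
//
//	h := newHeartbeatStop(c.getAllocRunner, heartbeatGrace, logger, c.shutdownCh)
//	go h.watch()            // single goroutine owns registration and stopping
//	h.setLastOk(time.Now()) // on every successful server heartbeat
//	h.allocHook(alloc)      // whenever an AllocRunner is (re)stored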