2023-04-10 15:36:59 +00:00
|
|
|
// Copyright (c) HashiCorp, Inc.
|
|
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
|
2020-04-13 20:08:24 +00:00
|
|
|
package client
|
|
|
|
|
|
|
|
import (
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
hclog "github.com/hashicorp/go-hclog"
|
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
|
|
)
|
|
|
|
|
|
|
|
// heartbeatStop tracks the last successful server heartbeat and stops
// allocations whose task group configures stop_after_client_disconnect once
// the client has gone without a heartbeat for longer than that interval.
type heartbeatStop struct {
	// lastOk is the time of the last successful heartbeat; guarded by lock.
	lastOk time.Time

	// startupGrace is the deadline before which allocs are never stopped,
	// covering the window before the first heartbeat has completed.
	startupGrace time.Time

	// allocInterval maps alloc ID to its stop_after_client_disconnect
	// duration; accessed only from the watch() goroutine.
	allocInterval map[string]time.Duration

	// allocHookCh delivers allocations registered via allocHook to watch().
	allocHookCh chan *structs.Allocation

	// getRunner resolves an alloc ID to its AllocRunner so it can be
	// destroyed on heartbeat timeout.
	getRunner func(string) (AllocRunner, error)

	logger hclog.InterceptLogger

	// shutdownCh, when closed, terminates the watch() loop.
	shutdownCh chan struct{}

	// lock guards lastOk.
	lock *sync.RWMutex
}
|
|
|
|
|
|
|
|
func newHeartbeatStop(
|
|
|
|
getRunner func(string) (AllocRunner, error),
|
2020-04-28 20:13:09 +00:00
|
|
|
timeout time.Duration,
|
2020-04-13 20:08:24 +00:00
|
|
|
logger hclog.InterceptLogger,
|
|
|
|
shutdownCh chan struct{}) *heartbeatStop {
|
|
|
|
|
|
|
|
h := &heartbeatStop{
|
2020-04-28 20:13:09 +00:00
|
|
|
startupGrace: time.Now().Add(timeout),
|
2020-04-13 20:08:24 +00:00
|
|
|
allocInterval: make(map[string]time.Duration),
|
|
|
|
allocHookCh: make(chan *structs.Allocation),
|
|
|
|
getRunner: getRunner,
|
|
|
|
logger: logger,
|
|
|
|
shutdownCh: shutdownCh,
|
|
|
|
lock: &sync.RWMutex{},
|
|
|
|
}
|
|
|
|
|
|
|
|
return h
|
|
|
|
}
|
|
|
|
|
|
|
|
// allocHook is called after (re)storing a new AllocRunner in the client. It registers the
|
|
|
|
// allocation to be stopped if the taskgroup is configured appropriately
|
|
|
|
func (h *heartbeatStop) allocHook(alloc *structs.Allocation) {
|
|
|
|
tg := allocTaskGroup(alloc)
|
|
|
|
if tg.StopAfterClientDisconnect != nil {
|
|
|
|
h.allocHookCh <- alloc
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// shouldStop is called on a restored alloc to determine if lastOk is sufficiently in the
|
|
|
|
// past that it should be prevented from restarting
|
|
|
|
func (h *heartbeatStop) shouldStop(alloc *structs.Allocation) bool {
|
|
|
|
tg := allocTaskGroup(alloc)
|
|
|
|
if tg.StopAfterClientDisconnect != nil {
|
2020-04-28 20:13:09 +00:00
|
|
|
return h.shouldStopAfter(time.Now(), *tg.StopAfterClientDisconnect)
|
2020-04-13 20:08:24 +00:00
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2020-04-28 20:13:09 +00:00
|
|
|
func (h *heartbeatStop) shouldStopAfter(now time.Time, interval time.Duration) bool {
|
|
|
|
lastOk := h.getLastOk()
|
|
|
|
if lastOk.IsZero() {
|
2020-06-03 13:48:38 +00:00
|
|
|
return now.After(h.startupGrace)
|
2020-04-28 20:13:09 +00:00
|
|
|
}
|
|
|
|
return now.After(lastOk.Add(interval))
|
|
|
|
}
|
|
|
|
|
2020-04-13 20:08:24 +00:00
|
|
|
// watch is a loop that checks for allocations that should be stopped. It also manages the
|
|
|
|
// registration of allocs to be stopped in a single thread.
|
|
|
|
func (h *heartbeatStop) watch() {
|
|
|
|
// If we never manage to successfully contact the server, we want to stop our allocs
|
|
|
|
// after duration + start time
|
2022-08-16 16:41:08 +00:00
|
|
|
h.setLastOk(time.Now())
|
2020-04-13 20:08:24 +00:00
|
|
|
stop := make(chan string, 1)
|
|
|
|
var now time.Time
|
|
|
|
var interval time.Duration
|
|
|
|
checkAllocs := false
|
|
|
|
|
|
|
|
for {
|
|
|
|
// minimize the interval
|
|
|
|
interval = 5 * time.Second
|
|
|
|
for _, t := range h.allocInterval {
|
|
|
|
if t < interval {
|
|
|
|
interval = t
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
checkAllocs = false
|
|
|
|
timeout := time.After(interval)
|
|
|
|
|
|
|
|
select {
|
|
|
|
case allocID := <-stop:
|
|
|
|
if err := h.stopAlloc(allocID); err != nil {
|
2020-05-13 20:39:04 +00:00
|
|
|
h.logger.Warn("error stopping on heartbeat timeout", "alloc", allocID, "error", err)
|
2020-04-13 20:08:24 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
delete(h.allocInterval, allocID)
|
|
|
|
|
|
|
|
case alloc := <-h.allocHookCh:
|
|
|
|
tg := allocTaskGroup(alloc)
|
|
|
|
if tg.StopAfterClientDisconnect != nil {
|
|
|
|
h.allocInterval[alloc.ID] = *tg.StopAfterClientDisconnect
|
|
|
|
}
|
|
|
|
|
|
|
|
case <-timeout:
|
|
|
|
checkAllocs = true
|
|
|
|
|
|
|
|
case <-h.shutdownCh:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
if !checkAllocs {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
now = time.Now()
|
|
|
|
for allocID, d := range h.allocInterval {
|
2020-04-28 20:13:09 +00:00
|
|
|
if h.shouldStopAfter(now, d) {
|
2020-04-13 20:08:24 +00:00
|
|
|
stop <- allocID
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// setLastOk sets the last known good heartbeat time to the current time, and persists that time to disk
|
2020-04-28 20:13:09 +00:00
|
|
|
func (h *heartbeatStop) setLastOk(t time.Time) {
|
2020-04-13 20:08:24 +00:00
|
|
|
h.lock.Lock()
|
|
|
|
defer h.lock.Unlock()
|
|
|
|
h.lastOk = t
|
|
|
|
}
|
|
|
|
|
|
|
|
func (h *heartbeatStop) getLastOk() time.Time {
|
|
|
|
h.lock.RLock()
|
|
|
|
defer h.lock.RUnlock()
|
|
|
|
return h.lastOk
|
|
|
|
}
|
|
|
|
|
|
|
|
// stopAlloc actually stops the allocation
|
|
|
|
func (h *heartbeatStop) stopAlloc(allocID string) error {
|
|
|
|
runner, err := h.getRunner(allocID)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2020-05-13 20:39:04 +00:00
|
|
|
h.logger.Debug("stopping alloc for stop_after_client_disconnect", "alloc", allocID)
|
|
|
|
|
2020-04-13 20:08:24 +00:00
|
|
|
runner.Destroy()
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func allocTaskGroup(alloc *structs.Allocation) *structs.TaskGroup {
|
|
|
|
for _, tg := range alloc.Job.TaskGroups {
|
|
|
|
if tg.Name == alloc.TaskGroup {
|
|
|
|
return tg
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|