64449cddc1
Most allocation hooks don't need to know when a single task within the allocation is restarted. The check watcher for group services triggers the alloc runner to restart all tasks, but the alloc runner's `Restart` method doesn't trigger any of the alloc hooks, including the group service hook. The result is that after the first time a check triggers a restart, we'll never restart the tasks of an allocation again. This commit adds a `RunnerTaskRestartHook` interface so that alloc runner hooks can act if a task within the alloc is restarted. The only implementation is in the group service hook, which will force a re-registration of the alloc's services and fix check restarts.
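For reference, a minimal sketch of what the new interface could look like; the PreTaskRestart method matches the implementation in this file, but the exact shape of the interface (and whether it embeds the base runner hook) is an assumption rather than the verbatim source:

// RunnerTaskRestartHook (sketch) is implemented by alloc runner hooks that
// need to act when a task within the allocation is restarted.
type RunnerTaskRestartHook interface {
	// Name identifies the hook; assumed here for symmetry with the other
	// runner hook interfaces.
	Name() string

	// PreTaskRestart is called when a task within the alloc is restarted;
	// groupServiceHook implements it below to re-register the group's
	// services.
	PreTaskRestart() error
}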
232 lines
5.9 KiB
Go
package allocrunner

import (
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/taskenv"
	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/nomad/structs"
)

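// networkStatusGetter provides access to the allocation's network status so
// it can be included in Consul service registrations.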
type networkStatusGetter interface {
	NetworkStatus() *structs.AllocNetworkStatus
}

// groupServiceHook manages task group Consul service registration and
// deregistration.
type groupServiceHook struct {
	allocID             string
	group               string
	restarter           agentconsul.WorkloadRestarter
	consulClient        consul.ConsulServiceAPI
	prerun              bool
	delay               time.Duration
	deregistered        bool
	networkStatusGetter networkStatusGetter

	logger log.Logger

	// The following fields may be updated
	canary         bool
	services       []*structs.Service
	networks       structs.Networks
	ports          structs.AllocatedPorts
	taskEnvBuilder *taskenv.Builder

	// Since Update() may be called concurrently with any other hook, all
	// hook methods must be fully serialized
	mu sync.Mutex
}

type groupServiceHookConfig struct {
	alloc               *structs.Allocation
	consul              consul.ConsulServiceAPI
	restarter           agentconsul.WorkloadRestarter
	taskEnvBuilder      *taskenv.Builder
	networkStatusGetter networkStatusGetter
	logger              log.Logger
}

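// newGroupServiceHook returns a groupServiceHook populated from the
// allocation: the group's services, shared networks and ports, shutdown
// delay, and canary status.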
func newGroupServiceHook(cfg groupServiceHookConfig) *groupServiceHook {
	var shutdownDelay time.Duration
	tg := cfg.alloc.Job.LookupTaskGroup(cfg.alloc.TaskGroup)

	if tg.ShutdownDelay != nil {
		shutdownDelay = *tg.ShutdownDelay
	}

	h := &groupServiceHook{
		allocID:             cfg.alloc.ID,
		group:               cfg.alloc.TaskGroup,
		restarter:           cfg.restarter,
		consulClient:        cfg.consul,
		taskEnvBuilder:      cfg.taskEnvBuilder,
		delay:               shutdownDelay,
		networkStatusGetter: cfg.networkStatusGetter,
	}
	h.logger = cfg.logger.Named(h.Name())
	h.services = cfg.alloc.Job.LookupTaskGroup(h.group).Services

	if cfg.alloc.AllocatedResources != nil {
		h.networks = cfg.alloc.AllocatedResources.Shared.Networks
		h.ports = cfg.alloc.AllocatedResources.Shared.Ports
	}

	if cfg.alloc.DeploymentStatus != nil {
		h.canary = cfg.alloc.DeploymentStatus.Canary
	}
	return h
}

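// Name returns the name of this hook, which is also used to name its logger.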
func (*groupServiceHook) Name() string {
	return "group_services"
}

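// Prerun registers the group's services with Consul and marks the hook as
// having run so that subsequent Updates are pushed to Consul.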
func (h *groupServiceHook) Prerun() error {
	h.mu.Lock()
	defer func() {
		// Mark prerun as true to unblock Updates
		h.prerun = true
		h.mu.Unlock()
	}()
	return h.prerunLocked()
}

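// prerunLocked registers the group's services with Consul. The caller must
// hold the lock.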
func (h *groupServiceHook) prerunLocked() error {
	if len(h.services) == 0 {
		return nil
	}

	services := h.getWorkloadServices()
	return h.consulClient.RegisterWorkload(services)
}

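// Update refreshes the hook's services, networks, ports, canary status, and
// shutdown delay from the updated allocation, then syncs the old and new
// service registrations with Consul once Prerun has run.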
func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
	h.mu.Lock()
	defer h.mu.Unlock()

	oldWorkloadServices := h.getWorkloadServices()

	// Store new updated values out of request
	canary := false
	if req.Alloc.DeploymentStatus != nil {
		canary = req.Alloc.DeploymentStatus.Canary
	}

	var networks structs.Networks
	if req.Alloc.AllocatedResources != nil {
		networks = req.Alloc.AllocatedResources.Shared.Networks
		h.ports = req.Alloc.AllocatedResources.Shared.Ports
	}

	tg := req.Alloc.Job.LookupTaskGroup(h.group)
	var shutdown time.Duration
	if tg.ShutdownDelay != nil {
		shutdown = *tg.ShutdownDelay
	}

	// Update group service hook fields
	h.networks = networks
	h.services = tg.Services
	h.canary = canary
	h.delay = shutdown
	h.taskEnvBuilder.UpdateTask(req.Alloc, nil)

	// Create new task services struct with those new values
	newWorkloadServices := h.getWorkloadServices()

	if !h.prerun {
		// Update called before Prerun. Update alloc and exit to allow
		// Prerun to do initial registration.
		return nil
	}

	return h.consulClient.UpdateWorkload(oldWorkloadServices, newWorkloadServices)
}

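// PreTaskRestart runs when a task within the allocation is restarted: it
// deregisters the group's services (waiting out any shutdown delay) and then
// re-registers them so that Consul check-triggered restarts keep working.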
func (h *groupServiceHook) PreTaskRestart() error {
	h.mu.Lock()
	defer func() {
		// Mark prerun as true to unblock Updates
		h.prerun = true
		h.mu.Unlock()
	}()

	h.preKillLocked()
	return h.prerunLocked()
}

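// PreKill deregisters the group's services before the tasks are killed and
// waits for the group's shutdown delay, if any, before returning.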
func (h *groupServiceHook) PreKill() {
	h.mu.Lock()
	defer h.mu.Unlock()
	h.preKillLocked()
}

// preKillLocked implements the PreKill hook but requires the caller to hold
// the lock.
func (h *groupServiceHook) preKillLocked() {
	// If we have a shutdown delay, deregister group services and then wait
	// before continuing to kill tasks.
	h.deregister()
	h.deregistered = true

	if h.delay == 0 {
		return
	}

	h.logger.Debug("waiting before removing group service", "shutdown_delay", h.delay)

	// Wait for the specified shutdown_delay; this will block an agent from
	// shutting down.
	<-time.After(h.delay)
}

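// Postrun deregisters the group's services unless PreKill has already done
// so.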
func (h *groupServiceHook) Postrun() error {
	h.mu.Lock()
	defer h.mu.Unlock()

	if !h.deregistered {
		h.deregister()
	}
	return nil
}

// deregister services from Consul.
func (h *groupServiceHook) deregister() {
	if len(h.services) > 0 {
		workloadServices := h.getWorkloadServices()
		h.consulClient.RemoveWorkload(workloadServices)

		// Canary flag may be getting flipped when the alloc is being
		// destroyed, so remove both variations of the service
		workloadServices.Canary = !workloadServices.Canary
		h.consulClient.RemoveWorkload(workloadServices)
	}
}

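// getWorkloadServices interpolates the group's services with the task
// environment and bundles them with the hook's current network, port, and
// canary state for the Consul client.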
func (h *groupServiceHook) getWorkloadServices() *agentconsul.WorkloadServices {
	// Interpolate with the task's environment
	interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services)

	var netStatus *structs.AllocNetworkStatus
	if h.networkStatusGetter != nil {
		netStatus = h.networkStatusGetter.NetworkStatus()
	}

	// Create task services struct with request's driver metadata
	return &agentconsul.WorkloadServices{
		AllocID:       h.allocID,
		Group:         h.group,
		Restarter:     h.restarter,
		Services:      interpolatedServices,
		Networks:      h.networks,
		NetworkStatus: netStatus,
		Ports:         h.ports,
		Canary:        h.canary,
	}
}