2019-08-14 22:02:00 +00:00
|
|
|
package allocrunner
|
|
|
|
|
|
|
|
import (
|
|
|
|
"sync"
|
2019-11-18 16:16:25 +00:00
|
|
|
"time"
|
2019-08-14 22:02:00 +00:00
|
|
|
|
|
|
|
log "github.com/hashicorp/go-hclog"
|
|
|
|
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
|
|
|
"github.com/hashicorp/nomad/client/consul"
|
2019-11-18 18:04:01 +00:00
|
|
|
"github.com/hashicorp/nomad/client/taskenv"
|
|
|
|
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
|
2019-08-14 22:02:00 +00:00
|
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
|
|
)
|
|
|
|
|
2020-10-15 19:32:21 +00:00
|
|
|
type networkStatusGetter interface {
|
|
|
|
NetworkStatus() *structs.AllocNetworkStatus
|
|
|
|
}
|
|
|
|
|
2019-08-14 22:02:00 +00:00
|
|
|
// groupServiceHook manages task group Consul service registration and
|
|
|
|
// deregistration.
|
|
|
|
type groupServiceHook struct {
|
2020-10-15 19:32:21 +00:00
|
|
|
allocID string
|
|
|
|
group string
|
|
|
|
restarter agentconsul.WorkloadRestarter
|
|
|
|
consulClient consul.ConsulServiceAPI
|
|
|
|
prerun bool
|
|
|
|
delay time.Duration
|
|
|
|
deregistered bool
|
|
|
|
networkStatusGetter networkStatusGetter
|
2019-08-14 22:02:00 +00:00
|
|
|
|
|
|
|
logger log.Logger
|
2019-11-18 18:04:01 +00:00
|
|
|
|
|
|
|
// The following fields may be updated
|
|
|
|
canary bool
|
|
|
|
services []*structs.Service
|
|
|
|
networks structs.Networks
|
2020-10-15 19:32:21 +00:00
|
|
|
ports structs.AllocatedPorts
|
2019-11-18 18:04:01 +00:00
|
|
|
taskEnvBuilder *taskenv.Builder
|
|
|
|
|
|
|
|
// Since Update() may be called concurrently with any other hook all
|
|
|
|
// hook methods must be fully serialized
|
|
|
|
mu sync.Mutex
|
|
|
|
}
|
|
|
|
|
|
|
|
type groupServiceHookConfig struct {
|
2020-10-15 19:32:21 +00:00
|
|
|
alloc *structs.Allocation
|
|
|
|
consul consul.ConsulServiceAPI
|
|
|
|
restarter agentconsul.WorkloadRestarter
|
|
|
|
taskEnvBuilder *taskenv.Builder
|
|
|
|
networkStatusGetter networkStatusGetter
|
|
|
|
logger log.Logger
|
2019-08-14 22:02:00 +00:00
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
func newGroupServiceHook(cfg groupServiceHookConfig) *groupServiceHook {
|
2019-11-18 16:16:25 +00:00
|
|
|
var shutdownDelay time.Duration
|
|
|
|
tg := cfg.alloc.Job.LookupTaskGroup(cfg.alloc.TaskGroup)
|
|
|
|
|
2019-12-04 16:06:01 +00:00
|
|
|
if tg.ShutdownDelay != nil {
|
2019-11-18 16:16:25 +00:00
|
|
|
shutdownDelay = *tg.ShutdownDelay
|
|
|
|
}
|
|
|
|
|
2019-08-14 22:02:00 +00:00
|
|
|
h := &groupServiceHook{
|
2020-10-15 19:32:21 +00:00
|
|
|
allocID: cfg.alloc.ID,
|
|
|
|
group: cfg.alloc.TaskGroup,
|
|
|
|
restarter: cfg.restarter,
|
|
|
|
consulClient: cfg.consul,
|
|
|
|
taskEnvBuilder: cfg.taskEnvBuilder,
|
|
|
|
delay: shutdownDelay,
|
|
|
|
networkStatusGetter: cfg.networkStatusGetter,
|
2019-11-18 18:04:01 +00:00
|
|
|
}
|
|
|
|
h.logger = cfg.logger.Named(h.Name())
|
|
|
|
h.services = cfg.alloc.Job.LookupTaskGroup(h.group).Services
|
|
|
|
|
|
|
|
if cfg.alloc.AllocatedResources != nil {
|
|
|
|
h.networks = cfg.alloc.AllocatedResources.Shared.Networks
|
2020-10-15 19:32:21 +00:00
|
|
|
h.ports = cfg.alloc.AllocatedResources.Shared.Ports
|
2019-11-18 18:04:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if cfg.alloc.DeploymentStatus != nil {
|
|
|
|
h.canary = cfg.alloc.DeploymentStatus.Canary
|
2019-08-14 22:02:00 +00:00
|
|
|
}
|
2021-01-22 19:45:26 +00:00
|
|
|
|
2019-08-14 22:02:00 +00:00
|
|
|
return h
|
|
|
|
}
|
|
|
|
|
|
|
|
func (*groupServiceHook) Name() string {
|
|
|
|
return "group_services"
|
|
|
|
}
|
|
|
|
|
|
|
|
func (h *groupServiceHook) Prerun() error {
|
|
|
|
h.mu.Lock()
|
|
|
|
defer func() {
|
|
|
|
// Mark prerun as true to unblock Updates
|
|
|
|
h.prerun = true
|
|
|
|
h.mu.Unlock()
|
|
|
|
}()
|
2021-01-21 16:36:00 +00:00
|
|
|
return h.prerunLocked()
|
|
|
|
}
|
2019-11-18 18:04:01 +00:00
|
|
|
|
2021-01-21 16:36:00 +00:00
|
|
|
func (h *groupServiceHook) prerunLocked() error {
|
2019-11-18 18:04:01 +00:00
|
|
|
if len(h.services) == 0 {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
services := h.getWorkloadServices()
|
|
|
|
return h.consulClient.RegisterWorkload(services)
|
2019-08-14 22:02:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
|
|
|
|
h.mu.Lock()
|
|
|
|
defer h.mu.Unlock()
|
client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.
To enable this feature, the flag enable_tag_override must be configured
in the service definition.
Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.
Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.
Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.
In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).
Fixes #2057
2020-02-07 21:22:19 +00:00
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
oldWorkloadServices := h.getWorkloadServices()
|
|
|
|
|
|
|
|
// Store new updated values out of request
|
|
|
|
canary := false
|
|
|
|
if req.Alloc.DeploymentStatus != nil {
|
|
|
|
canary = req.Alloc.DeploymentStatus.Canary
|
|
|
|
}
|
|
|
|
|
|
|
|
var networks structs.Networks
|
|
|
|
if req.Alloc.AllocatedResources != nil {
|
|
|
|
networks = req.Alloc.AllocatedResources.Shared.Networks
|
2020-10-15 19:32:21 +00:00
|
|
|
h.ports = req.Alloc.AllocatedResources.Shared.Ports
|
2019-11-18 18:04:01 +00:00
|
|
|
}
|
|
|
|
|
2020-04-05 21:30:25 +00:00
|
|
|
tg := req.Alloc.Job.LookupTaskGroup(h.group)
|
|
|
|
var shutdown time.Duration
|
|
|
|
if tg.ShutdownDelay != nil {
|
|
|
|
shutdown = *tg.ShutdownDelay
|
|
|
|
}
|
2020-04-06 15:33:04 +00:00
|
|
|
|
|
|
|
// Update group service hook fields
|
|
|
|
h.networks = networks
|
|
|
|
h.services = tg.Services
|
|
|
|
h.canary = canary
|
2020-04-05 21:30:25 +00:00
|
|
|
h.delay = shutdown
|
2020-04-06 15:33:04 +00:00
|
|
|
h.taskEnvBuilder.UpdateTask(req.Alloc, nil)
|
2020-04-05 21:30:25 +00:00
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// Create new task services struct with those new values
|
|
|
|
newWorkloadServices := h.getWorkloadServices()
|
2019-08-14 22:02:00 +00:00
|
|
|
|
|
|
|
if !h.prerun {
|
|
|
|
// Update called before Prerun. Update alloc and exit to allow
|
|
|
|
// Prerun to do initial registration.
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
return h.consulClient.UpdateWorkload(oldWorkloadServices, newWorkloadServices)
|
2019-08-14 22:02:00 +00:00
|
|
|
}
|
|
|
|
|
2021-01-21 16:36:00 +00:00
|
|
|
func (h *groupServiceHook) PreTaskRestart() error {
|
|
|
|
h.mu.Lock()
|
|
|
|
defer func() {
|
|
|
|
// Mark prerun as true to unblock Updates
|
|
|
|
h.prerun = true
|
|
|
|
h.mu.Unlock()
|
|
|
|
}()
|
|
|
|
|
|
|
|
h.preKillLocked()
|
|
|
|
return h.prerunLocked()
|
|
|
|
}
|
|
|
|
|
2019-11-18 16:16:25 +00:00
|
|
|
func (h *groupServiceHook) PreKill() {
|
2019-08-14 22:02:00 +00:00
|
|
|
h.mu.Lock()
|
|
|
|
defer h.mu.Unlock()
|
2021-01-21 16:36:00 +00:00
|
|
|
h.preKillLocked()
|
|
|
|
}
|
2019-11-18 16:16:25 +00:00
|
|
|
|
2021-01-21 16:36:00 +00:00
|
|
|
// implements the PreKill hook but requires the caller hold the lock
|
|
|
|
func (h *groupServiceHook) preKillLocked() {
|
2019-11-18 16:16:25 +00:00
|
|
|
// If we have a shutdown delay deregister
|
|
|
|
// group services and then wait
|
|
|
|
// before continuing to kill tasks
|
2019-11-18 18:04:01 +00:00
|
|
|
h.deregister()
|
2019-11-18 16:16:25 +00:00
|
|
|
h.deregistered = true
|
|
|
|
|
2019-12-04 16:06:01 +00:00
|
|
|
if h.delay == 0 {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2019-11-18 16:16:25 +00:00
|
|
|
h.logger.Debug("waiting before removing group service", "shutdown_delay", h.delay)
|
2019-12-03 13:58:29 +00:00
|
|
|
|
|
|
|
// Wait for specified shutdown_delay
|
|
|
|
// this will block an agent from shutting down
|
|
|
|
<-time.After(h.delay)
|
2019-11-18 16:16:25 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (h *groupServiceHook) Postrun() error {
|
|
|
|
h.mu.Lock()
|
|
|
|
defer h.mu.Unlock()
|
|
|
|
|
|
|
|
if !h.deregistered {
|
|
|
|
h.deregister()
|
|
|
|
}
|
2019-11-18 18:04:01 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// deregister services from Consul.
|
|
|
|
func (h *groupServiceHook) deregister() {
|
|
|
|
if len(h.services) > 0 {
|
|
|
|
workloadServices := h.getWorkloadServices()
|
|
|
|
h.consulClient.RemoveWorkload(workloadServices)
|
|
|
|
|
|
|
|
// Canary flag may be getting flipped when the alloc is being
|
|
|
|
// destroyed, so remove both variations of the service
|
|
|
|
workloadServices.Canary = !workloadServices.Canary
|
|
|
|
h.consulClient.RemoveWorkload(workloadServices)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (h *groupServiceHook) getWorkloadServices() *agentconsul.WorkloadServices {
|
|
|
|
// Interpolate with the task's environment
|
|
|
|
interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services)
|
|
|
|
|
2020-10-15 19:32:21 +00:00
|
|
|
var netStatus *structs.AllocNetworkStatus
|
|
|
|
if h.networkStatusGetter != nil {
|
|
|
|
netStatus = h.networkStatusGetter.NetworkStatus()
|
|
|
|
}
|
|
|
|
|
2019-11-18 18:04:01 +00:00
|
|
|
// Create task services struct with request's driver metadata
|
|
|
|
return &agentconsul.WorkloadServices{
|
|
|
|
AllocID: h.allocID,
|
|
|
|
Group: h.group,
|
|
|
|
Restarter: h.restarter,
|
|
|
|
Services: interpolatedServices,
|
|
|
|
Networks: h.networks,
|
2020-10-15 19:32:21 +00:00
|
|
|
NetworkStatus: netStatus,
|
|
|
|
Ports: h.ports,
|
2019-11-18 18:04:01 +00:00
|
|
|
Canary: h.canary,
|
|
|
|
}
|
2019-08-14 22:02:00 +00:00
|
|
|
}
|