open-nomad/client/allocrunner/groupservice_hook.go
Seth Hoenig 0e44094d1a client: enable configuring enable_tag_override for services
Consul provides a feature of Service Definitions where the tags
associated with a service can be modified through the Catalog API,
overriding the value(s) configured in the agent's service configuration.

To enable this feature, the flag enable_tag_override must be configured
in the service definition.

Previously, Nomad did not allow configuring this flag, and thus the default
value of false was used. Now, it is configurable.

Because Nomad itself acts as a state machine around the the service definitions
of the tasks it manages, it's worth describing what happens when this feature
is enabled and why.

Consider the basic case where there is no Nomad, and your service is provided
to consul as a boring JSON file. The ultimate source of truth for the definition
of that service is the file, and is stored in the agent. Later, Consul performs
"anti-entropy" which synchronizes the Catalog (stored only the leaders). Then
with enable_tag_override=true, the tags field is available for "external"
modification through the Catalog API (rather than directly configuring the
service definition file, or using the Agent API). The important observation
is that if the service definition ever changes (i.e. the file is changed &
config reloaded OR the Agent API is used to modify the service), those
"external" tag values are thrown away, and the new service definition is
once again the source of truth.

In the Nomad case, Nomad itself is the source of truth over the Agent in
the same way the JSON file was the source of truth in the example above.
That means any time Nomad sets a new service definition, any externally
configured tags are going to be replaced. When does this happen? Only on
major lifecycle events, for example when a task is modified because of an
updated job spec from the 'nomad job run <existing>' command. Otherwise,
Nomad's periodic re-sync's with Consul will now no longer try to restore
the externally modified tag values (as long as enable_tag_override=true).

Fixes #2057
2020-02-10 08:00:55 -06:00

207 lines
5.2 KiB
Go

package allocrunner
import (
"sync"
"time"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/client/consul"
"github.com/hashicorp/nomad/client/taskenv"
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
)
// groupServiceHook manages task group Consul service registration and
// deregistration.
type groupServiceHook struct {
allocID string
group string
restarter agentconsul.WorkloadRestarter
consulClient consul.ConsulServiceAPI
prerun bool
delay time.Duration
deregistered bool
logger log.Logger
// The following fields may be updated
canary bool
services []*structs.Service
networks structs.Networks
taskEnvBuilder *taskenv.Builder
// Since Update() may be called concurrently with any other hook all
// hook methods must be fully serialized
mu sync.Mutex
}
type groupServiceHookConfig struct {
alloc *structs.Allocation
consul consul.ConsulServiceAPI
restarter agentconsul.WorkloadRestarter
taskEnvBuilder *taskenv.Builder
logger log.Logger
}
func newGroupServiceHook(cfg groupServiceHookConfig) *groupServiceHook {
var shutdownDelay time.Duration
tg := cfg.alloc.Job.LookupTaskGroup(cfg.alloc.TaskGroup)
if tg.ShutdownDelay != nil {
shutdownDelay = *tg.ShutdownDelay
}
h := &groupServiceHook{
allocID: cfg.alloc.ID,
group: cfg.alloc.TaskGroup,
restarter: cfg.restarter,
consulClient: cfg.consul,
taskEnvBuilder: cfg.taskEnvBuilder,
delay: shutdownDelay,
}
h.logger = cfg.logger.Named(h.Name())
h.services = cfg.alloc.Job.LookupTaskGroup(h.group).Services
if cfg.alloc.AllocatedResources != nil {
h.networks = cfg.alloc.AllocatedResources.Shared.Networks
}
if cfg.alloc.DeploymentStatus != nil {
h.canary = cfg.alloc.DeploymentStatus.Canary
}
return h
}
func (*groupServiceHook) Name() string {
return "group_services"
}
func (h *groupServiceHook) Prerun() error {
h.mu.Lock()
defer func() {
// Mark prerun as true to unblock Updates
h.prerun = true
h.mu.Unlock()
}()
if len(h.services) == 0 {
return nil
}
services := h.getWorkloadServices()
return h.consulClient.RegisterWorkload(services)
}
func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
h.mu.Lock()
defer h.mu.Unlock()
oldWorkloadServices := h.getWorkloadServices()
// Store new updated values out of request
canary := false
if req.Alloc.DeploymentStatus != nil {
canary = req.Alloc.DeploymentStatus.Canary
}
var networks structs.Networks
if req.Alloc.AllocatedResources != nil {
networks = req.Alloc.AllocatedResources.Shared.Networks
}
// Update group service hook fields
h.networks = networks
h.services = req.Alloc.Job.LookupTaskGroup(h.group).Services
h.canary = canary
h.taskEnvBuilder.UpdateTask(req.Alloc, nil)
// Create new task services struct with those new values
newWorkloadServices := h.getWorkloadServices()
if !h.prerun {
// Update called before Prerun. Update alloc and exit to allow
// Prerun to do initial registration.
return nil
}
return h.consulClient.UpdateWorkload(oldWorkloadServices, newWorkloadServices)
}
func (h *groupServiceHook) PreKill() {
h.mu.Lock()
defer h.mu.Unlock()
// If we have a shutdown delay deregister
// group services and then wait
// before continuing to kill tasks
h.deregister()
h.deregistered = true
if h.delay == 0 {
return
}
h.logger.Debug("waiting before removing group service", "shutdown_delay", h.delay)
// Wait for specified shutdown_delay
// this will block an agent from shutting down
<-time.After(h.delay)
}
func (h *groupServiceHook) Postrun() error {
h.mu.Lock()
defer h.mu.Unlock()
if !h.deregistered {
h.deregister()
}
return nil
}
func (h *groupServiceHook) driverNet() *drivers.DriverNetwork {
if len(h.networks) == 0 {
return nil
}
//TODO(schmichael) only support one network for now
net := h.networks[0]
//TODO(schmichael) there's probably a better way than hacking driver network
return &drivers.DriverNetwork{
AutoAdvertise: true,
IP: net.IP,
// Copy PortLabels from group network
PortMap: net.PortLabels(),
}
}
// deregister services from Consul.
func (h *groupServiceHook) deregister() {
if len(h.services) > 0 {
workloadServices := h.getWorkloadServices()
h.consulClient.RemoveWorkload(workloadServices)
// Canary flag may be getting flipped when the alloc is being
// destroyed, so remove both variations of the service
workloadServices.Canary = !workloadServices.Canary
h.consulClient.RemoveWorkload(workloadServices)
}
}
func (h *groupServiceHook) getWorkloadServices() *agentconsul.WorkloadServices {
// Interpolate with the task's environment
interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services)
// Create task services struct with request's driver metadata
return &agentconsul.WorkloadServices{
AllocID: h.allocID,
Group: h.group,
Restarter: h.restarter,
Services: interpolatedServices,
DriverNetwork: h.driverNet(),
Networks: h.networks,
Canary: h.canary,
}
}