07543f8bdf

* services: always set deregister flag after deregistration of group

  This PR fixes a bug where the group service hook's deregister flag was not
  set in some cases, causing the hook to attempt deregistrations twice during
  job updates (alloc replacement). In the tests ... we used to assert on the
  wrong behavior (remove twice), which has now been corrected to assert that
  we remove only once. This bug was "silent" in the Consul provider world
  because the error logs for double deregistration only show up in Consul
  logs; with the Nomad provider the error logs are in the Nomad agent logs.

* services: cleanup group service hook tests
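
The essence of the fix is the guard in deregisterLocked (shown in full in the
file below): the deregistered flag is checked first and always set afterwards,
so a second deregistration attempt during a job update becomes a no-op.

    func (h *groupServiceHook) deregisterLocked() {
        if h.deregistered {
            return // services already removed; don't deregister twice
        }
        if len(h.services) > 0 {
            h.serviceRegWrapper.RemoveWorkload(h.getWorkloadServicesLocked())
        }
        h.deregistered = true // set even when there were no services to remove
    }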
package allocrunner

import (
	"context"
	"sync"
	"time"

	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/serviceregistration"
	"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
	"github.com/hashicorp/nomad/client/taskenv"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	groupServiceHookName = "group_services"
)

// groupServiceHook manages task group Consul or Nomad service registration
// and deregistration.
type groupServiceHook struct {
	allocID   string
	jobID     string
	group     string
	namespace string
	restarter serviceregistration.WorkloadRestarter
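
	// prerun is set to true once Prerun has run, unblocking Update.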
	prerun bool
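
	// deregistered is set once the group's services have been deregistered.
	// It must always be set after deregistration so the hook does not attempt
	// to deregister a second time during job updates (alloc replacement).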
	deregistered bool

	networkStatus structs.NetworkStatus
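
	// shutdownDelayCtx, when done, cuts short the wait on the group's
	// shutdown delay in preKillLocked.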
	shutdownDelayCtx context.Context

	// providerNamespace is the Nomad or Consul namespace in which service
	// registrations will be made. This field may be updated.
	providerNamespace string

	// serviceRegWrapper is the handler wrapper that is used to perform service
	// and check registration and deregistration.
	serviceRegWrapper *wrapper.HandlerWrapper

	logger hclog.Logger

	// The following fields may be updated
	canary         bool
	services       []*structs.Service
	networks       structs.Networks
	ports          structs.AllocatedPorts
	taskEnvBuilder *taskenv.Builder
	delay          time.Duration

	// Since Update() may be called concurrently with any other hook, all
	// hook methods must be fully serialized.
	mu sync.Mutex
}

type groupServiceHookConfig struct {
	alloc            *structs.Allocation
	restarter        serviceregistration.WorkloadRestarter
	taskEnvBuilder   *taskenv.Builder
	networkStatus    structs.NetworkStatus
	shutdownDelayCtx context.Context
	logger           hclog.Logger

	// providerNamespace is the Nomad or Consul namespace in which service
	// registrations will be made.
	providerNamespace string

	// serviceRegWrapper is the handler wrapper that is used to perform service
	// and check registration and deregistration.
	serviceRegWrapper *wrapper.HandlerWrapper
}
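
// newGroupServiceHook returns a new group service hook configured from cfg
// and its allocation.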
func newGroupServiceHook(cfg groupServiceHookConfig) *groupServiceHook {
	var shutdownDelay time.Duration
	tg := cfg.alloc.Job.LookupTaskGroup(cfg.alloc.TaskGroup)

	if tg.ShutdownDelay != nil {
		shutdownDelay = *tg.ShutdownDelay
	}

	h := &groupServiceHook{
		allocID:           cfg.alloc.ID,
		jobID:             cfg.alloc.JobID,
		group:             cfg.alloc.TaskGroup,
		namespace:         cfg.alloc.Namespace,
		restarter:         cfg.restarter,
		providerNamespace: cfg.providerNamespace,
		taskEnvBuilder:    cfg.taskEnvBuilder,
		delay:             shutdownDelay,
		networkStatus:     cfg.networkStatus,
		logger:            cfg.logger.Named(groupServiceHookName),
		serviceRegWrapper: cfg.serviceRegWrapper,
		services:          tg.Services,
		shutdownDelayCtx:  cfg.shutdownDelayCtx,
	}

	if cfg.alloc.AllocatedResources != nil {
		h.networks = cfg.alloc.AllocatedResources.Shared.Networks
		h.ports = cfg.alloc.AllocatedResources.Shared.Ports
	}

	if cfg.alloc.DeploymentStatus != nil {
		h.canary = cfg.alloc.DeploymentStatus.Canary
	}

	return h
}
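
// Name returns the name of this hook.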
func (*groupServiceHook) Name() string {
	return groupServiceHookName
}
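
// Prerun performs the initial registration of the group's services with the
// configured service provider.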
func (h *groupServiceHook) Prerun() error {
	h.mu.Lock()
	defer func() {
		// Mark prerun as true to unblock Updates
		h.prerun = true
		h.mu.Unlock()
	}()
	return h.preRunLocked()
}

// caller must hold h.lock
func (h *groupServiceHook) preRunLocked() error {
	if len(h.services) == 0 {
		return nil
	}

	services := h.getWorkloadServicesLocked()
	return h.serviceRegWrapper.RegisterWorkload(services)
}

// Update is run when a job submitter modifies service(s) (but not much else;
// otherwise a full alloc replacement would occur).
func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
	h.mu.Lock()
	defer h.mu.Unlock()

	oldWorkloadServices := h.getWorkloadServicesLocked()

	// Store the new, updated values from the request.
	canary := false
	if req.Alloc.DeploymentStatus != nil {
		canary = req.Alloc.DeploymentStatus.Canary
	}

	var networks structs.Networks
	if req.Alloc.AllocatedResources != nil {
		networks = req.Alloc.AllocatedResources.Shared.Networks
		h.ports = req.Alloc.AllocatedResources.Shared.Ports
	}

	tg := req.Alloc.Job.LookupTaskGroup(h.group)
	var shutdown time.Duration
	if tg.ShutdownDelay != nil {
		shutdown = *tg.ShutdownDelay
	}

	// Update group service hook fields
	h.networks = networks
	h.services = tg.Services
	h.canary = canary
	h.delay = shutdown
	h.taskEnvBuilder.UpdateTask(req.Alloc, nil)

	// An update may change the service provider, therefore we need to account
	// for how namespaces work across providers also.
	h.providerNamespace = req.Alloc.ServiceProviderNamespace()

	// Create a new task services struct with those new values.
	newWorkloadServices := h.getWorkloadServicesLocked()

	if !h.prerun {
		// Update called before Prerun. Update alloc and exit to allow
		// Prerun to do initial registration.
		return nil
	}

	return h.serviceRegWrapper.UpdateWorkload(oldWorkloadServices, newWorkloadServices)
}
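
// PreTaskRestart deregisters the group's services, waits out any shutdown
// delay, and then re-registers the services when a task in the group is
// restarted.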
func (h *groupServiceHook) PreTaskRestart() error {
	h.mu.Lock()
	defer func() {
		// Mark prerun as true to unblock Updates
		h.prerun = true
		h.mu.Unlock()
	}()

	h.preKillLocked()
	return h.preRunLocked()
}
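
// PreKill deregisters the group's services and then blocks for the group's
// shutdown_delay, if one is configured.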
func (h *groupServiceHook) PreKill() {
	helper.WithLock(&h.mu, h.preKillLocked)
}

// preKillLocked implements the PreKill hook.
//
// caller must hold h.lock
func (h *groupServiceHook) preKillLocked() {
	// If we have a shutdown delay, deregister group services and then wait
	// before continuing to kill tasks.
	h.deregisterLocked()

	if h.delay == 0 {
		return
	}

	h.logger.Debug("delay before killing tasks", "group", h.group, "shutdown_delay", h.delay)

	timer, cancel := helper.NewSafeTimer(h.delay)
	defer cancel()

	select {
	// Wait for the specified shutdown_delay unless it is ignored via
	// shutdownDelayCtx. This wait will block an agent from shutting down.
	case <-timer.C:
	case <-h.shutdownDelayCtx.Done():
	}
}
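
// Postrun deregisters the group's services. Deregistration is idempotent
// (see deregisterLocked), so this is a no-op if PreKill already removed them.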
func (h *groupServiceHook) Postrun() error {
	helper.WithLock(&h.mu, h.deregisterLocked)
	return nil
}

// deregisterLocked deregisters the group's services from the Consul/Nomad
// service provider. It is idempotent: once services have been removed, the
// deregistered flag prevents a second removal on subsequent calls.
//
// caller must hold h.lock
func (h *groupServiceHook) deregisterLocked() {
	if h.deregistered {
		return
	}

	if len(h.services) > 0 {
		workloadServices := h.getWorkloadServicesLocked()
		h.serviceRegWrapper.RemoveWorkload(workloadServices)
	}

	h.deregistered = true
}

// getWorkloadServicesLocked returns the set of workload services currently
// on the hook.
//
// caller must hold h.lock
func (h *groupServiceHook) getWorkloadServicesLocked() *serviceregistration.WorkloadServices {
	// Interpolate with the task's environment
	interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services)

	var netStatus *structs.AllocNetworkStatus
	if h.networkStatus != nil {
		netStatus = h.networkStatus.NetworkStatus()
	}

	info := structs.AllocInfo{
		AllocID:   h.allocID,
		JobID:     h.jobID,
		Group:     h.group,
		Namespace: h.namespace,
	}

	// Create task services struct with request's driver metadata
	return &serviceregistration.WorkloadServices{
		AllocInfo:         info,
		ProviderNamespace: h.providerNamespace,
		Restarter:         h.restarter,
		Services:          interpolatedServices,
		Networks:          h.networks,
		NetworkStatus:     netStatus,
		Ports:             h.ports,
		Canary:            h.canary,
	}
}