open-nomad/client/allocrunner/group_service_hook.go
Seth Hoenig 753c17c9de
services: un-mark group services as deregistered if restart hook runs (#16905)
* services: un-mark group services as deregistered if restart hook runs

This PR may fix a bug where group services will never be deregistered if the
group undergoes a task restart.

* e2e: add test case for restart and deregister group service

* cl: add cl

* e2e: add wait for service list call
2023-04-24 14:24:51 -05:00

284 lines
7.5 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package allocrunner
import (
"context"
"sync"
"time"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/client/serviceregistration"
"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
"github.com/hashicorp/nomad/client/taskenv"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
const (
groupServiceHookName = "group_services"
)
// groupServiceHook manages task group Consul service registration and
// deregistration.
type groupServiceHook struct {
allocID string
jobID string
group string
namespace string
restarter serviceregistration.WorkloadRestarter
prerun bool
deregistered bool
networkStatus structs.NetworkStatus
shutdownDelayCtx context.Context
// providerNamespace is the Nomad or Consul namespace in which service
// registrations will be made. This field may be updated.
providerNamespace string
// serviceRegWrapper is the handler wrapper that is used to perform service
// and check registration and deregistration.
serviceRegWrapper *wrapper.HandlerWrapper
logger hclog.Logger
// The following fields may be updated
canary bool
services []*structs.Service
networks structs.Networks
ports structs.AllocatedPorts
taskEnvBuilder *taskenv.Builder
delay time.Duration
// Since Update() may be called concurrently with any other hook all
// hook methods must be fully serialized
mu sync.Mutex
}
type groupServiceHookConfig struct {
alloc *structs.Allocation
restarter serviceregistration.WorkloadRestarter
taskEnvBuilder *taskenv.Builder
networkStatus structs.NetworkStatus
shutdownDelayCtx context.Context
logger hclog.Logger
// providerNamespace is the Nomad or Consul namespace in which service
// registrations will be made.
providerNamespace string
// serviceRegWrapper is the handler wrapper that is used to perform service
// and check registration and deregistration.
serviceRegWrapper *wrapper.HandlerWrapper
}
func newGroupServiceHook(cfg groupServiceHookConfig) *groupServiceHook {
var shutdownDelay time.Duration
tg := cfg.alloc.Job.LookupTaskGroup(cfg.alloc.TaskGroup)
if tg.ShutdownDelay != nil {
shutdownDelay = *tg.ShutdownDelay
}
h := &groupServiceHook{
allocID: cfg.alloc.ID,
jobID: cfg.alloc.JobID,
group: cfg.alloc.TaskGroup,
namespace: cfg.alloc.Namespace,
restarter: cfg.restarter,
providerNamespace: cfg.providerNamespace,
taskEnvBuilder: cfg.taskEnvBuilder,
delay: shutdownDelay,
networkStatus: cfg.networkStatus,
logger: cfg.logger.Named(groupServiceHookName),
serviceRegWrapper: cfg.serviceRegWrapper,
services: tg.Services,
shutdownDelayCtx: cfg.shutdownDelayCtx,
}
if cfg.alloc.AllocatedResources != nil {
h.networks = cfg.alloc.AllocatedResources.Shared.Networks
h.ports = cfg.alloc.AllocatedResources.Shared.Ports
}
if cfg.alloc.DeploymentStatus != nil {
h.canary = cfg.alloc.DeploymentStatus.Canary
}
return h
}
func (*groupServiceHook) Name() string {
return groupServiceHookName
}
func (h *groupServiceHook) Prerun() error {
h.mu.Lock()
defer func() {
// Mark prerun as true to unblock Updates
h.prerun = true
// Mark deregistered as false to allow de-registration
h.deregistered = false
h.mu.Unlock()
}()
return h.preRunLocked()
}
// caller must hold h.mu
func (h *groupServiceHook) preRunLocked() error {
if len(h.services) == 0 {
return nil
}
services := h.getWorkloadServicesLocked()
return h.serviceRegWrapper.RegisterWorkload(services)
}
// Update is run when a job submitter modifies service(s) (but not much else -
// otherwise a full alloc replacement would occur).
func (h *groupServiceHook) Update(req *interfaces.RunnerUpdateRequest) error {
h.mu.Lock()
defer h.mu.Unlock()
oldWorkloadServices := h.getWorkloadServicesLocked()
// Store new updated values out of request
canary := false
if req.Alloc.DeploymentStatus != nil {
canary = req.Alloc.DeploymentStatus.Canary
}
var networks structs.Networks
if req.Alloc.AllocatedResources != nil {
networks = req.Alloc.AllocatedResources.Shared.Networks
h.ports = req.Alloc.AllocatedResources.Shared.Ports
}
tg := req.Alloc.Job.LookupTaskGroup(h.group)
var shutdown time.Duration
if tg.ShutdownDelay != nil {
shutdown = *tg.ShutdownDelay
}
// Update group service hook fields
h.networks = networks
h.services = tg.Services
h.canary = canary
h.delay = shutdown
h.taskEnvBuilder.UpdateTask(req.Alloc, nil)
// An update may change the service provider, therefore we need to account
// for how namespaces work across providers also.
h.providerNamespace = req.Alloc.ServiceProviderNamespace()
// Create new task services struct with those new values
newWorkloadServices := h.getWorkloadServicesLocked()
if !h.prerun {
// Update called before Prerun. Update alloc and exit to allow
// Prerun to do initial registration.
return nil
}
return h.serviceRegWrapper.UpdateWorkload(oldWorkloadServices, newWorkloadServices)
}
func (h *groupServiceHook) PreTaskRestart() error {
h.mu.Lock()
defer func() {
// Mark prerun as true to unblock Updates
h.prerun = true
// Mark deregistered as false to allow de-registration
h.deregistered = false
h.mu.Unlock()
}()
h.preKillLocked()
return h.preRunLocked()
}
func (h *groupServiceHook) PreKill() {
helper.WithLock(&h.mu, h.preKillLocked)
}
// implements the PreKill hook
//
// caller must hold h.mu
func (h *groupServiceHook) preKillLocked() {
// If we have a shutdown delay deregister group services and then wait
// before continuing to kill tasks.
h.deregisterLocked()
if h.delay == 0 {
return
}
h.logger.Debug("delay before killing tasks", "group", h.group, "shutdown_delay", h.delay)
timer, cancel := helper.NewSafeTimer(h.delay)
defer cancel()
select {
// Wait for specified shutdown_delay unless ignored
// This will block an agent from shutting down.
case <-timer.C:
case <-h.shutdownDelayCtx.Done():
}
}
func (h *groupServiceHook) Postrun() error {
helper.WithLock(&h.mu, h.deregisterLocked)
return nil
}
// deregisterLocked will deregister services from Consul/Nomad service provider.
//
// caller must hold h.lock
func (h *groupServiceHook) deregisterLocked() {
if h.deregistered {
return
}
if len(h.services) > 0 {
workloadServices := h.getWorkloadServicesLocked()
h.serviceRegWrapper.RemoveWorkload(workloadServices)
}
h.deregistered = true
}
// getWorkloadServicesLocked returns the set of workload services currently
// on the hook.
//
// caller must hold h.lock
func (h *groupServiceHook) getWorkloadServicesLocked() *serviceregistration.WorkloadServices {
// Interpolate with the task's environment
interpolatedServices := taskenv.InterpolateServices(h.taskEnvBuilder.Build(), h.services)
var netStatus *structs.AllocNetworkStatus
if h.networkStatus != nil {
netStatus = h.networkStatus.NetworkStatus()
}
info := structs.AllocInfo{
AllocID: h.allocID,
JobID: h.jobID,
Group: h.group,
Namespace: h.namespace,
}
// Create task services struct with request's driver metadata
return &serviceregistration.WorkloadServices{
AllocInfo: info,
ProviderNamespace: h.providerNamespace,
Restarter: h.restarter,
Services: interpolatedServices,
Networks: h.networks,
NetworkStatus: netStatus,
Ports: h.ports,
Canary: h.canary,
}
}