open-nomad/client/allocrunner/taskrunner/remotetask_hook.go

package taskrunner

import (
	"context"

	hclog "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

var _ interfaces.TaskPrestartHook = (*remoteTaskHook)(nil)
var _ interfaces.TaskPreKillHook = (*remoteTaskHook)(nil)

// remoteTaskHook reattaches to remotely executing tasks.
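//
// Remote task drivers run the workload somewhere other than the client node,
// so when an allocation is lost or drained the task should be detached from
// rather than killed, and a replacement allocation should reattach to the
// still-running remote task instead of starting a new one.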
type remoteTaskHook struct {
	tr *TaskRunner

	logger hclog.Logger
}

func newRemoteTaskHook(tr *TaskRunner, logger hclog.Logger) interfaces.TaskHook {
	h := &remoteTaskHook{
		tr: tr,
	}
	h.logger = logger.Named(h.Name())
	return h
}

func (h *remoteTaskHook) Name() string {
	return "remote_task"
}

// Prestart performs 2 remote task driver related tasks:
//  1. If there is no local handle, see if there is a handle propagated from a
//     previous alloc to be restored.
//  2. If the alloc is lost make sure the task signal is set to detach instead
//     of kill.
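//
// Recovery is best effort: if the remote task cannot be recovered or
// inspected, the error is logged and a fresh instance is started rather than
// failing the task.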
func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {
	if h.tr.getDriverHandle() != nil {
		// Driver handle already exists so don't try to load remote
		// task handle
		return nil
	}

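	// Attempt to rebuild the driver task handle from task state, which may
	// have been propagated from a previous alloc.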
	h.tr.stateLock.Lock()
	th := drivers.NewTaskHandleFromState(h.tr.state)
	h.tr.stateLock.Unlock()

	// Task handle will be nil if there was no previous allocation or if
	// this is a destructive update
	if th == nil {
		resp.Done = true
		return nil
	}

	// The task config is unique per invocation so recreate it here
	th.Config = h.tr.buildTaskConfig()

	if err := h.tr.driver.RecoverTask(th); err != nil {
		// Soft error here to let a new instance get started instead of
		// failing the task since retrying is unlikely to help.
		h.logger.Error("error recovering task state", "error", err)
		return nil
	}

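	// Ask the driver for the current state of the recovered task so the new
	// handle and local state pick up any driver network override.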
	taskInfo, err := h.tr.driver.InspectTask(th.Config.ID)
	if err != nil {
		// Soft error here to let a new instance get started instead of
		// failing the task since retrying is unlikely to help.
		h.logger.Error("error inspecting recovered task state", "error", err)
		return nil
	}

	h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride))

	h.tr.stateLock.Lock()
	h.tr.localState.TaskHandle = th
	h.tr.localState.DriverNetwork = taskInfo.NetworkOverride
	h.tr.stateLock.Unlock()

	// Ensure the signal is set according to the allocation's state
	h.setSignal(h.tr.Alloc())

	// Emit TaskStarted manually since the normal task runner logic will
	// treat this task like a restored task and skip emitting started.
	h.tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))

	return nil
}

// PreKilling tells the remote task driver to detach a remote task instead of
// stopping it.
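// This lets lost or draining allocations leave the remote task running so a
// replacement allocation can reattach to it.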
func (h *remoteTaskHook) PreKilling(ctx context.Context, req *interfaces.TaskPreKillRequest, resp *interfaces.TaskPreKillResponse) error {
	alloc := h.tr.Alloc()
	h.setSignal(alloc)
	return nil
}

// setSignal sets the kill signal to detach if the allocation is lost or
// draining. Safe to call multiple times as it only transitions to using
// detach -- never back to kill.
func (h *remoteTaskHook) setSignal(alloc *structs.Allocation) {
	driverHandle := h.tr.getDriverHandle()
	if driverHandle == nil {
		// Nothing to do; exit early
		return
	}

	switch {
	case alloc.ClientStatus == structs.AllocClientStatusLost:
		// Continue on; lost allocs should just detach
		h.logger.Debug("detaching from remote task since alloc was lost")
	case alloc.DesiredTransition.ShouldMigrate():
		// Continue on; migrating allocs should just detach
		h.logger.Debug("detaching from remote task since alloc was drained")
	default:
		// Nothing to do; exit early
		return
	}

	// Set DetachSignal to indicate to the remote task driver that it
	// should detach this remote task and ignore it.
	driverHandle.SetKillSignal(drivers.DetachSignal)
}