open-nomad/client/allocrunner/taskrunner/lifecycle.go
Mahmood Ali 067fd86a8c
drivers: Capture exit code when task is killed (#10494)
This commit ensures Nomad captures the task code more reliably even when the task is killed. This issue affect to `raw_exec` driver, as noted in https://github.com/hashicorp/nomad/issues/10430 .

We fix this issue by ensuring that the TaskRunner only calls `driver.WaitTask` once. The TaskRunner monitors the completion of the task by calling `driver.WaitTask` which should return the task exit code on completion. However, it also could return a "context canceled" error if the agent/executor is shutdown.

Previously, when a task is to be stopped, the killTask path makes two WaitTask calls, and the second returns "context canceled" occasionally because of a "race" in task shutting down and depending on driver, and how fast it shuts down after task completes.

By having a single WaitTask call and consistently waiting for the task, we ensure we capture the exit code reliably before the executor is shutdown or the contexts expired.

I opted to change the TaskRunner implementation to avoid changing the driver interface or requiring 3rd party drivers to update.

Additionally, the PR ensures that attempts to kill the task terminate when the task "naturally" dies. Without this change, if the task dies at the right moment, the `killTask` call may retry to kill an already-dead task for up to 5 minutes before giving up.
2021-05-04 10:54:00 -04:00

93 lines
2.2 KiB
Go

package taskrunner
import (
"context"
"github.com/hashicorp/nomad/nomad/structs"
)
// Restart a task. Returns immediately if no task is running. Blocks until
// existing task exits or passed-in context is canceled.
func (tr *TaskRunner) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
tr.logger.Trace("Restart requested", "failure", failure)
// Grab the handle
handle := tr.getDriverHandle()
// Check it is running
if handle == nil {
return ErrTaskNotRunning
}
// Emit the event since it may take a long time to kill
tr.EmitEvent(event)
// Run the pre-kill hooks prior to restarting the task
tr.preKill()
// Tell the restart tracker that a restart triggered the exit
tr.restartTracker.SetRestartTriggered(failure)
// Grab a handle to the wait channel that will timeout with context cancelation
// _before_ killing the task.
waitCh, err := handle.WaitCh(ctx)
if err != nil {
return err
}
// Kill the task using an exponential backoff in-case of failures.
if _, err := tr.killTask(handle, waitCh); err != nil {
// We couldn't successfully destroy the resource created.
tr.logger.Error("failed to kill task. Resources may have been leaked", "error", err)
}
select {
case <-waitCh:
case <-ctx.Done():
}
return nil
}
func (tr *TaskRunner) Signal(event *structs.TaskEvent, s string) error {
tr.logger.Trace("Signal requested", "signal", s)
// Grab the handle
handle := tr.getDriverHandle()
// Check it is running
if handle == nil {
return ErrTaskNotRunning
}
// Emit the event
tr.EmitEvent(event)
// Send the signal
return handle.Signal(s)
}
// Kill a task. Blocks until task exits or context is canceled. State is set to
// dead.
func (tr *TaskRunner) Kill(ctx context.Context, event *structs.TaskEvent) error {
tr.logger.Trace("Kill requested", "event_type", event.Type, "event_reason", event.KillReason)
// Cancel the task runner to break out of restart delay or the main run
// loop.
tr.killCtxCancel()
// Emit kill event
tr.EmitEvent(event)
select {
case <-tr.WaitCh():
case <-ctx.Done():
return ctx.Err()
}
return tr.getKillErr()
}
func (tr *TaskRunner) IsRunning() bool {
return tr.getDriverHandle() != nil
}