diff --git a/.changelog/13626.txt b/.changelog/13626.txt new file mode 100644 index 000000000..9c6f86701 --- /dev/null +++ b/.changelog/13626.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a bug where max_kill_timeout client config was ignored +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 5ef9e5743..f8eaf8aa8 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -620,7 +620,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { } taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping leader task", "error", err, "task_name", name) @@ -643,7 +643,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping task", "error", err, "task_name", name) @@ -667,7 +667,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name) diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index b28c87dd4..fafb33573 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -126,7 +126,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing" if killingMsg != expectedKillingMsg { - return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg) + return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg) } // Task Two should be dead diff --git a/client/allocrunner/taskrunner/driver_handle.go b/client/allocrunner/taskrunner/driver_handle.go index 36427f6f2..73a376c8a 100644 --- a/client/allocrunner/taskrunner/driver_handle.go +++ b/client/allocrunner/taskrunner/driver_handle.go @@ -6,18 +6,24 @@ import ( "time" cstructs "github.com/hashicorp/nomad/client/structs" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" ) // NewDriverHandle returns a handle for task operations on a specific task -func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle { +func NewDriverHandle( + driver drivers.DriverPlugin, + taskID string, + task *structs.Task, + maxKillTimeout time.Duration, + net *drivers.DriverNetwork) *DriverHandle { return &DriverHandle{ driver: driver, net: net, taskID: taskID, killSignal: task.KillSignal, - killTimeout: task.KillTimeout, + killTimeout: helper.Min(task.KillTimeout, maxKillTimeout), } } diff --git a/client/allocrunner/taskrunner/remotetask_hook.go b/client/allocrunner/taskrunner/remotetask_hook.go index 2068b52d9..5a8ac03d1 100644 --- a/client/allocrunner/taskrunner/remotetask_hook.go +++ b/client/allocrunner/taskrunner/remotetask_hook.go @@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest return nil } - h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride)) + h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride)) h.tr.stateLock.Lock() h.tr.localState.TaskHandle = th diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index f5342962b..95c87440e 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -883,7 +883,7 @@ func (tr *TaskRunner) runDriver() error { } tr.stateLock.Unlock() - tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net)) + tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net)) // Emit an event that we started tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) @@ -1182,7 +1182,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers } // Update driver handle on task runner - tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net)) + tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net)) return true } diff --git a/client/config/testing.go b/client/config/testing.go index a326cfbd6..0204073fb 100644 --- a/client/config/testing.go +++ b/client/config/testing.go @@ -4,6 +4,7 @@ import ( "io/ioutil" "os" "path/filepath" + "time" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/helper" @@ -64,5 +65,9 @@ func TestClientConfig(t testing.T) (*Config, func()) { // Loosen GC threshold conf.GCDiskUsageThreshold = 98.0 conf.GCInodeUsageThreshold = 98.0 + + // Same as default; necessary for task Event messages + conf.MaxKillTimeout = 30 * time.Second + return conf, cleanup } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 1304ddd43..d7748d95e 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -6976,7 +6976,7 @@ type Task struct { // task exits, other tasks will be gracefully terminated. Leader bool - // ShutdownDelay is the duration of the delay between deregistering a + // ShutdownDelay is the duration of the delay between de-registering a // task from Consul and sending it a signal to shutdown. See #2441 ShutdownDelay time.Duration @@ -8379,9 +8379,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent { return e } -func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent { - e.KillTimeout = timeout - e.Details["kill_timeout"] = timeout.String() +func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent { + actual := helper.Min(timeout, maxTimeout) + e.KillTimeout = actual + e.Details["kill_timeout"] = actual.String() return e } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index cb155d433..376501670 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -6137,7 +6137,8 @@ func TestTaskEventPopulate(t *testing.T) { {NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"}, {NewTaskEvent(TaskKilling), "Sent interrupt"}, {NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"}, - {NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"}, + {NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"}, + {NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"}, {NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"}, {NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""}, {NewTaskEvent(TaskKilled), "Task successfully killed"},