From 5dd8aa3e27a7ad4130156532ed2b3bae0348db92 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 6 Jul 2022 14:44:58 -0500 Subject: [PATCH] client: enforce max_kill_timeout client configuration This PR fixes a bug where client configuration max_kill_timeout was not being enforced. The feature was introduced in 9f44780 but seems to have been removed during the major drivers refactoring. We can make sure the value is enforced by pluming it through the DriverHandler, which now uses the lesser of the task.killTimeout or client.maxKillTimeout. Also updates Event.SetKillTimeout to require both the task.killTimeout and client.maxKillTimeout so that we don't make the mistake of using the wrong value - as it was being given only the task.killTimeout before. --- .changelog/13626.txt | 3 +++ client/allocrunner/alloc_runner.go | 6 +++--- client/allocrunner/alloc_runner_test.go | 2 +- client/allocrunner/taskrunner/driver_handle.go | 10 ++++++++-- client/allocrunner/taskrunner/remotetask_hook.go | 2 +- client/allocrunner/taskrunner/task_runner.go | 4 ++-- client/config/testing.go | 5 +++++ nomad/structs/structs.go | 9 +++++---- nomad/structs/structs_test.go | 3 ++- 9 files changed, 30 insertions(+), 14 deletions(-) create mode 100644 .changelog/13626.txt diff --git a/.changelog/13626.txt b/.changelog/13626.txt new file mode 100644 index 000000000..9c6f86701 --- /dev/null +++ b/.changelog/13626.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a bug where max_kill_timeout client config was ignored +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 5ef9e5743..f8eaf8aa8 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -620,7 +620,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { } taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping leader task", "error", err, "task_name", name) @@ -643,7 +643,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping task", "error", err, "task_name", name) @@ -667,7 +667,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout) + taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name) diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index b28c87dd4..fafb33573 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -126,7 +126,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing" if killingMsg != expectedKillingMsg { - return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg) + return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg) } // Task Two should be dead diff --git a/client/allocrunner/taskrunner/driver_handle.go b/client/allocrunner/taskrunner/driver_handle.go index 36427f6f2..73a376c8a 100644 --- a/client/allocrunner/taskrunner/driver_handle.go +++ b/client/allocrunner/taskrunner/driver_handle.go @@ -6,18 +6,24 @@ import ( "time" cstructs "github.com/hashicorp/nomad/client/structs" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" ) // NewDriverHandle returns a handle for task operations on a specific task -func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle { +func NewDriverHandle( + driver drivers.DriverPlugin, + taskID string, + task *structs.Task, + maxKillTimeout time.Duration, + net *drivers.DriverNetwork) *DriverHandle { return &DriverHandle{ driver: driver, net: net, taskID: taskID, killSignal: task.KillSignal, - killTimeout: task.KillTimeout, + killTimeout: helper.Min(task.KillTimeout, maxKillTimeout), } } diff --git a/client/allocrunner/taskrunner/remotetask_hook.go b/client/allocrunner/taskrunner/remotetask_hook.go index 2068b52d9..5a8ac03d1 100644 --- a/client/allocrunner/taskrunner/remotetask_hook.go +++ b/client/allocrunner/taskrunner/remotetask_hook.go @@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest return nil } - h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride)) + h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride)) h.tr.stateLock.Lock() h.tr.localState.TaskHandle = th diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index f5342962b..95c87440e 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -883,7 +883,7 @@ func (tr *TaskRunner) runDriver() error { } tr.stateLock.Unlock() - tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net)) + tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net)) // Emit an event that we started tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) @@ -1182,7 +1182,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers } // Update driver handle on task runner - tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net)) + tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net)) return true } diff --git a/client/config/testing.go b/client/config/testing.go index a326cfbd6..0204073fb 100644 --- a/client/config/testing.go +++ b/client/config/testing.go @@ -4,6 +4,7 @@ import ( "io/ioutil" "os" "path/filepath" + "time" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/helper" @@ -64,5 +65,9 @@ func TestClientConfig(t testing.T) (*Config, func()) { // Loosen GC threshold conf.GCDiskUsageThreshold = 98.0 conf.GCInodeUsageThreshold = 98.0 + + // Same as default; necessary for task Event messages + conf.MaxKillTimeout = 30 * time.Second + return conf, cleanup } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 1304ddd43..d7748d95e 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -6976,7 +6976,7 @@ type Task struct { // task exits, other tasks will be gracefully terminated. Leader bool - // ShutdownDelay is the duration of the delay between deregistering a + // ShutdownDelay is the duration of the delay between de-registering a // task from Consul and sending it a signal to shutdown. See #2441 ShutdownDelay time.Duration @@ -8379,9 +8379,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent { return e } -func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent { - e.KillTimeout = timeout - e.Details["kill_timeout"] = timeout.String() +func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent { + actual := helper.Min(timeout, maxTimeout) + e.KillTimeout = actual + e.Details["kill_timeout"] = actual.String() return e } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index cb155d433..376501670 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -6137,7 +6137,8 @@ func TestTaskEventPopulate(t *testing.T) { {NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"}, {NewTaskEvent(TaskKilling), "Sent interrupt"}, {NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"}, - {NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"}, + {NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"}, + {NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"}, {NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"}, {NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""}, {NewTaskEvent(TaskKilled), "Task successfully killed"},