client: enforce max_kill_timeout client configuration

This PR fixes a bug where client configuration max_kill_timeout was
not being enforced. The feature was introduced in 9f44780 but seems
to have been removed during the major drivers refactoring.

We can make sure the value is enforced by pluming it through the DriverHandler,
which now uses the lesser of the task.killTimeout or client.maxKillTimeout.
Also updates Event.SetKillTimeout to require both the task.killTimeout and
client.maxKillTimeout so that we don't make the mistake of using the wrong
value - as it was being given only the task.killTimeout before.
This commit is contained in:
Seth Hoenig 2022-07-06 14:44:58 -05:00
parent 29c6b9dfdf
commit 5dd8aa3e27
9 changed files with 30 additions and 14 deletions

3
.changelog/13626.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where max_kill_timeout client config was ignored
```

View File

@ -620,7 +620,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
}
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
@ -643,7 +643,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping task", "error", err, "task_name", name)
@ -667,7 +667,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
go func(name string, tr *taskrunner.TaskRunner) {
defer wg.Done()
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
err := tr.Kill(context.TODO(), taskEvent)
if err != nil && err != taskrunner.ErrTaskNotRunning {
ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name)

View File

@ -126,7 +126,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing"
if killingMsg != expectedKillingMsg {
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg)
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg)
}
// Task Two should be dead

View File

@ -6,18 +6,24 @@ import (
"time"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
)
// NewDriverHandle returns a handle for task operations on a specific task
func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle {
func NewDriverHandle(
driver drivers.DriverPlugin,
taskID string,
task *structs.Task,
maxKillTimeout time.Duration,
net *drivers.DriverNetwork) *DriverHandle {
return &DriverHandle{
driver: driver,
net: net,
taskID: taskID,
killSignal: task.KillSignal,
killTimeout: task.KillTimeout,
killTimeout: helper.Min(task.KillTimeout, maxKillTimeout),
}
}

View File

@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest
return nil
}
h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride))
h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride))
h.tr.stateLock.Lock()
h.tr.localState.TaskHandle = th

View File

@ -883,7 +883,7 @@ func (tr *TaskRunner) runDriver() error {
}
tr.stateLock.Unlock()
tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))
tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))
// Emit an event that we started
tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
@ -1182,7 +1182,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers
}
// Update driver handle on task runner
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))
return true
}

View File

@ -4,6 +4,7 @@ import (
"io/ioutil"
"os"
"path/filepath"
"time"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper"
@ -64,5 +65,9 @@ func TestClientConfig(t testing.T) (*Config, func()) {
// Loosen GC threshold
conf.GCDiskUsageThreshold = 98.0
conf.GCInodeUsageThreshold = 98.0
// Same as default; necessary for task Event messages
conf.MaxKillTimeout = 30 * time.Second
return conf, cleanup
}

View File

@ -6976,7 +6976,7 @@ type Task struct {
// task exits, other tasks will be gracefully terminated.
Leader bool
// ShutdownDelay is the duration of the delay between deregistering a
// ShutdownDelay is the duration of the delay between de-registering a
// task from Consul and sending it a signal to shutdown. See #2441
ShutdownDelay time.Duration
@ -8379,9 +8379,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent {
return e
}
func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent {
e.KillTimeout = timeout
e.Details["kill_timeout"] = timeout.String()
func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent {
actual := helper.Min(timeout, maxTimeout)
e.KillTimeout = actual
e.Details["kill_timeout"] = actual.String()
return e
}

View File

@ -6137,7 +6137,8 @@ func TestTaskEventPopulate(t *testing.T) {
{NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"},
{NewTaskEvent(TaskKilling), "Sent interrupt"},
{NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"},
{NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"},
{NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"},
{NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"},
{NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"},
{NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""},
{NewTaskEvent(TaskKilled), "Task successfully killed"},