client: enforce max_kill_timeout client configuration
This PR fixes a bug where client configuration max_kill_timeout was not being enforced. The feature was introduced in 9f44780 but seems to have been removed during the major drivers refactoring. We can make sure the value is enforced by pluming it through the DriverHandler, which now uses the lesser of the task.killTimeout or client.maxKillTimeout. Also updates Event.SetKillTimeout to require both the task.killTimeout and client.maxKillTimeout so that we don't make the mistake of using the wrong value - as it was being given only the task.killTimeout before.
This commit is contained in:
parent
29c6b9dfdf
commit
5dd8aa3e27
|
@ -0,0 +1,3 @@
|
|||
```release-note:bug
|
||||
client: Fixed a bug where max_kill_timeout client config was ignored
|
||||
```
|
|
@ -620,7 +620,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
|
|||
}
|
||||
|
||||
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
|
||||
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
|
||||
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
|
||||
err := tr.Kill(context.TODO(), taskEvent)
|
||||
if err != nil && err != taskrunner.ErrTaskNotRunning {
|
||||
ar.logger.Warn("error stopping leader task", "error", err, "task_name", name)
|
||||
|
@ -643,7 +643,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
|
|||
go func(name string, tr *taskrunner.TaskRunner) {
|
||||
defer wg.Done()
|
||||
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
|
||||
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
|
||||
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
|
||||
err := tr.Kill(context.TODO(), taskEvent)
|
||||
if err != nil && err != taskrunner.ErrTaskNotRunning {
|
||||
ar.logger.Warn("error stopping task", "error", err, "task_name", name)
|
||||
|
@ -667,7 +667,7 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState {
|
|||
go func(name string, tr *taskrunner.TaskRunner) {
|
||||
defer wg.Done()
|
||||
taskEvent := structs.NewTaskEvent(structs.TaskKilling)
|
||||
taskEvent.SetKillTimeout(tr.Task().KillTimeout)
|
||||
taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout)
|
||||
err := tr.Kill(context.TODO(), taskEvent)
|
||||
if err != nil && err != taskrunner.ErrTaskNotRunning {
|
||||
ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name)
|
||||
|
|
|
@ -126,7 +126,7 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
|
|||
|
||||
expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing"
|
||||
if killingMsg != expectedKillingMsg {
|
||||
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", killingMsg, expectedKillingMsg)
|
||||
return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg)
|
||||
}
|
||||
|
||||
// Task Two should be dead
|
||||
|
|
|
@ -6,18 +6,24 @@ import (
|
|||
"time"
|
||||
|
||||
cstructs "github.com/hashicorp/nomad/client/structs"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/hashicorp/nomad/plugins/drivers"
|
||||
)
|
||||
|
||||
// NewDriverHandle returns a handle for task operations on a specific task
|
||||
func NewDriverHandle(driver drivers.DriverPlugin, taskID string, task *structs.Task, net *drivers.DriverNetwork) *DriverHandle {
|
||||
func NewDriverHandle(
|
||||
driver drivers.DriverPlugin,
|
||||
taskID string,
|
||||
task *structs.Task,
|
||||
maxKillTimeout time.Duration,
|
||||
net *drivers.DriverNetwork) *DriverHandle {
|
||||
return &DriverHandle{
|
||||
driver: driver,
|
||||
net: net,
|
||||
taskID: taskID,
|
||||
killSignal: task.KillSignal,
|
||||
killTimeout: task.KillTimeout,
|
||||
killTimeout: helper.Min(task.KillTimeout, maxKillTimeout),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -72,7 +72,7 @@ func (h *remoteTaskHook) Prestart(ctx context.Context, req *interfaces.TaskPrest
|
|||
return nil
|
||||
}
|
||||
|
||||
h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), taskInfo.NetworkOverride))
|
||||
h.tr.setDriverHandle(NewDriverHandle(h.tr.driver, th.Config.ID, h.tr.Task(), h.tr.clientConfig.MaxKillTimeout, taskInfo.NetworkOverride))
|
||||
|
||||
h.tr.stateLock.Lock()
|
||||
h.tr.localState.TaskHandle = th
|
||||
|
|
|
@ -883,7 +883,7 @@ func (tr *TaskRunner) runDriver() error {
|
|||
}
|
||||
tr.stateLock.Unlock()
|
||||
|
||||
tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), net))
|
||||
tr.setDriverHandle(NewDriverHandle(tr.driver, taskConfig.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))
|
||||
|
||||
// Emit an event that we started
|
||||
tr.UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
|
||||
|
@ -1182,7 +1182,7 @@ func (tr *TaskRunner) restoreHandle(taskHandle *drivers.TaskHandle, net *drivers
|
|||
}
|
||||
|
||||
// Update driver handle on task runner
|
||||
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), net))
|
||||
tr.setDriverHandle(NewDriverHandle(tr.driver, taskHandle.Config.ID, tr.Task(), tr.clientConfig.MaxKillTimeout, net))
|
||||
return true
|
||||
}
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import (
|
|||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/ci"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
|
@ -64,5 +65,9 @@ func TestClientConfig(t testing.T) (*Config, func()) {
|
|||
// Loosen GC threshold
|
||||
conf.GCDiskUsageThreshold = 98.0
|
||||
conf.GCInodeUsageThreshold = 98.0
|
||||
|
||||
// Same as default; necessary for task Event messages
|
||||
conf.MaxKillTimeout = 30 * time.Second
|
||||
|
||||
return conf, cleanup
|
||||
}
|
||||
|
|
|
@ -6976,7 +6976,7 @@ type Task struct {
|
|||
// task exits, other tasks will be gracefully terminated.
|
||||
Leader bool
|
||||
|
||||
// ShutdownDelay is the duration of the delay between deregistering a
|
||||
// ShutdownDelay is the duration of the delay between de-registering a
|
||||
// task from Consul and sending it a signal to shutdown. See #2441
|
||||
ShutdownDelay time.Duration
|
||||
|
||||
|
@ -8379,9 +8379,10 @@ func (e *TaskEvent) SetValidationError(err error) *TaskEvent {
|
|||
return e
|
||||
}
|
||||
|
||||
func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent {
|
||||
e.KillTimeout = timeout
|
||||
e.Details["kill_timeout"] = timeout.String()
|
||||
func (e *TaskEvent) SetKillTimeout(timeout, maxTimeout time.Duration) *TaskEvent {
|
||||
actual := helper.Min(timeout, maxTimeout)
|
||||
e.KillTimeout = actual
|
||||
e.Details["kill_timeout"] = actual.String()
|
||||
return e
|
||||
}
|
||||
|
||||
|
|
|
@ -6137,7 +6137,8 @@ func TestTaskEventPopulate(t *testing.T) {
|
|||
{NewTaskEvent(TaskRestarting).SetRestartReason("Chaos Monkey did it"), "Chaos Monkey did it - Task restarting in 0s"},
|
||||
{NewTaskEvent(TaskKilling), "Sent interrupt"},
|
||||
{NewTaskEvent(TaskKilling).SetKillReason("Its time for you to die"), "Its time for you to die"},
|
||||
{NewTaskEvent(TaskKilling).SetKillTimeout(1 * time.Second), "Sent interrupt. Waiting 1s before force killing"},
|
||||
{NewTaskEvent(TaskKilling).SetKillTimeout(1*time.Second, 5*time.Second), "Sent interrupt. Waiting 1s before force killing"},
|
||||
{NewTaskEvent(TaskKilling).SetKillTimeout(10*time.Second, 5*time.Second), "Sent interrupt. Waiting 5s before force killing"},
|
||||
{NewTaskEvent(TaskTerminated).SetExitCode(-1).SetSignal(3), "Exit Code: -1, Signal: 3"},
|
||||
{NewTaskEvent(TaskTerminated).SetMessage("Goodbye"), "Exit Code: 0, Exit Message: \"Goodbye\""},
|
||||
{NewTaskEvent(TaskKilled), "Task successfully killed"},
|
||||
|
|
Loading…
Reference in New Issue