package taskrunner

import (
	"context"
	"errors"
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/httptest"
	"os"
	"path/filepath"
	"strings"
	"testing"
	"time"

	"github.com/golang/snappy"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/getter"
	"github.com/hashicorp/nomad/client/config"
	consulapi "github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/devicemanager"
	"github.com/hashicorp/nomad/client/lib/cgutil"
	"github.com/hashicorp/nomad/client/pluginmanager/drivermanager"
	regMock "github.com/hashicorp/nomad/client/serviceregistration/mock"
	"github.com/hashicorp/nomad/client/serviceregistration/wrapper"
	cstate "github.com/hashicorp/nomad/client/state"
	ctestutil "github.com/hashicorp/nomad/client/testutil"
	"github.com/hashicorp/nomad/client/vaultclient"
	agentconsul "github.com/hashicorp/nomad/command/agent/consul"
	mockdriver "github.com/hashicorp/nomad/drivers/mock"
	"github.com/hashicorp/nomad/drivers/rawexec"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/device"
	"github.com/hashicorp/nomad/plugins/drivers"
	"github.com/hashicorp/nomad/testutil"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

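// MockTaskStateUpdater is used as the StateUpdater in test TaskRunner configs.
// It signals on a buffered channel each time TaskStateUpdated is called so
// tests can observe that an update fired without blocking the runner.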
type MockTaskStateUpdater struct {
	ch chan struct{}
}

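// NewMockTaskStateUpdater returns a MockTaskStateUpdater with a buffered
// notification channel of size one.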
func NewMockTaskStateUpdater() *MockTaskStateUpdater {
	return &MockTaskStateUpdater{
		ch: make(chan struct{}, 1),
	}
}

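// TaskStateUpdated performs a non-blocking send on the notification channel so
// a slow or absent receiver never stalls the TaskRunner under test.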
func (m *MockTaskStateUpdater) TaskStateUpdated() {
	select {
	case m.ch <- struct{}{}:
	default:
	}
}

// testTaskRunnerConfig returns a taskrunner.Config for the given alloc+task
// plus a cleanup func.
func testTaskRunnerConfig(t *testing.T, alloc *structs.Allocation, taskName string) (*Config, func()) {
	logger := testlog.HCLogger(t)
	clientConf, cleanup := config.TestClientConfig(t)

	// Find the task
	var thisTask *structs.Task
	for _, tg := range alloc.Job.TaskGroups {
		for _, task := range tg.Tasks {
			if task.Name == taskName {
				if thisTask != nil {
					cleanup()
					t.Fatalf("multiple tasks named %q; cannot use this helper", taskName)
				}
				thisTask = task
			}
		}
	}
	if thisTask == nil {
		cleanup()
		t.Fatalf("could not find task %q", taskName)
	}

	// Create the alloc dir + task dir
	allocDir := allocdir.NewAllocDir(logger, clientConf.AllocDir, alloc.ID)
	if err := allocDir.Build(); err != nil {
		cleanup()
		t.Fatalf("error building alloc dir: %v", err)
	}
	taskDir := allocDir.NewTaskDir(taskName)

	// Compute the name of the v2 cgroup in case we need it in creation, configuration, and cleanup
	cgroup := filepath.Join(cgutil.CgroupRoot, "testing.slice", cgutil.CgroupScope(alloc.ID, taskName))

	// Create the cgroup if we are in v2 mode
	if cgutil.UseV2 {
		if err := os.MkdirAll(cgroup, 0755); err != nil {
			t.Fatalf("failed to setup v2 cgroup for test: %v:", err)
		}
	}

	trCleanup := func() {
		if err := allocDir.Destroy(); err != nil {
			t.Logf("error destroying alloc dir: %v", err)
		}

		// Cleanup the cgroup if we are in v2 mode
		if cgutil.UseV2 {
			_ = os.RemoveAll(cgroup)
		}

		cleanup()
	}

	shutdownDelayCtx, shutdownDelayCancelFn := context.WithCancel(context.Background())

	// Create a closed channel to mock TaskHookCoordinator.startConditionForTask.
	// Closed channel indicates this task is not blocked on prestart hooks.
	closedCh := make(chan struct{})
	close(closedCh)

	// Set up the Nomad and Consul registration providers along with the wrapper.
	consulRegMock := regMock.NewServiceRegistrationHandler(logger)
	nomadRegMock := regMock.NewServiceRegistrationHandler(logger)
	wrapperMock := wrapper.NewHandlerWrapper(logger, consulRegMock, nomadRegMock)

	conf := &Config{
		Alloc:                 alloc,
		ClientConfig:          clientConf,
		Task:                  thisTask,
		TaskDir:               taskDir,
		Logger:                clientConf.Logger,
		Consul:                consulRegMock,
		ConsulSI:              consulapi.NewMockServiceIdentitiesClient(),
		Vault:                 vaultclient.NewMockVaultClient(),
		StateDB:               cstate.NoopDB{},
		StateUpdater:          NewMockTaskStateUpdater(),
		DeviceManager:         devicemanager.NoopMockManager(),
		DriverManager:         drivermanager.TestDriverManager(t),
		ServersContactedCh:    make(chan struct{}),
		StartConditionMetCtx:  closedCh,
		ShutdownDelayCtx:      shutdownDelayCtx,
		ShutdownDelayCancelFn: shutdownDelayCancelFn,
		ServiceRegWrapper:     wrapperMock,
		Getter:                getter.TestDefaultGetter(t),
	}

	// Set the cgroup path getter if we are in v2 mode
	if cgutil.UseV2 {
		conf.CpusetCgroupPathGetter = func(context.Context) (string, error) {
			return filepath.Join(cgutil.CgroupRoot, "testing.slice", alloc.ID, thisTask.Name), nil
		}
	}

	return conf, trCleanup
}

// runTestTaskRunner runs a TaskRunner and returns its configuration as well as
// a cleanup function that ensures the runner is stopped and cleaned up. Tests
// which need to change the Config *must* use testTaskRunnerConfig instead.
func runTestTaskRunner(t *testing.T, alloc *structs.Allocation, taskName string) (*TaskRunner, *Config, func()) {
	config, cleanup := testTaskRunnerConfig(t, alloc, taskName)

	tr, err := NewTaskRunner(config)
	require.NoError(t, err)
	go tr.Run()

	return tr, config, func() {
		tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
		cleanup()
	}
}

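// TestTaskRunner_BuildTaskConfig_CPU_Memory asserts that CPU shares and memory
// limits (including memory_max oversubscription) from the allocated resources
// are carried into the driver TaskConfig built by buildTaskConfig.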
func TestTaskRunner_BuildTaskConfig_CPU_Memory(t *testing.T) {
	ci.Parallel(t)

	cases := []struct {
		name                  string
		cpu                   int64
		memoryMB              int64
		memoryMaxMB           int64
		expectedLinuxMemoryMB int64
	}{
		{
			name:                  "plain no max",
			cpu:                   100,
			memoryMB:              100,
			memoryMaxMB:           0,
			expectedLinuxMemoryMB: 100,
		},
		{
			name:                  "plain with max=reserve",
			cpu:                   100,
			memoryMB:              100,
			memoryMaxMB:           100,
			expectedLinuxMemoryMB: 100,
		},
		{
			name:                  "plain with max>reserve",
			cpu:                   100,
			memoryMB:              100,
			memoryMaxMB:           200,
			expectedLinuxMemoryMB: 200,
		},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			alloc := mock.BatchAlloc()
			alloc.Job.TaskGroups[0].Count = 1
			task := alloc.Job.TaskGroups[0].Tasks[0]
			task.Driver = "mock_driver"
			task.Config = map[string]interface{}{
				"run_for": "2s",
			}
			res := alloc.AllocatedResources.Tasks[task.Name]
			res.Cpu.CpuShares = c.cpu
			res.Memory.MemoryMB = c.memoryMB
			res.Memory.MemoryMaxMB = c.memoryMaxMB

			conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
			conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between task runners
			defer cleanup()

			// Run the first TaskRunner
			tr, err := NewTaskRunner(conf)
			require.NoError(t, err)

			tc := tr.buildTaskConfig()
			require.Equal(t, c.cpu, tc.Resources.LinuxResources.CPUShares)
			require.Equal(t, c.expectedLinuxMemoryMB*1024*1024, tc.Resources.LinuxResources.MemoryLimitBytes)

			require.Equal(t, c.cpu, tc.Resources.NomadResources.Cpu.CpuShares)
			require.Equal(t, c.memoryMB, tc.Resources.NomadResources.Memory.MemoryMB)
			require.Equal(t, c.memoryMaxMB, tc.Resources.NomadResources.Memory.MemoryMaxMB)
		})
	}
}

// TestTaskRunner_Stop_ExitCode asserts that the exit code is captured on a task, even if it's stopped
func TestTaskRunner_Stop_ExitCode(t *testing.T) {
	ctestutil.ExecCompatible(t)
	ci.Parallel(t)

	alloc := mock.BatchAlloc()
	alloc.Job.TaskGroups[0].Count = 1
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.KillSignal = "SIGTERM"
	task.Driver = "raw_exec"
	task.Config = map[string]interface{}{
		"command": "/bin/sleep",
		"args":    []string{"1000"},
	}
	task.Env = map[string]string{
		"NOMAD_PARENT_CGROUP": "nomad.slice",
		"NOMAD_ALLOC_ID":      alloc.ID,
		"NOMAD_TASK_NAME":     task.Name,
	}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	// Run the first TaskRunner
	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	go tr.Run()

	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))

	// Wait for it to be running
	testWaitForTaskToStart(t, tr)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	err = tr.Kill(ctx, structs.NewTaskEvent("shutdown"))
	require.NoError(t, err)

	var exitEvent *structs.TaskEvent
	state := tr.TaskState()
	for _, e := range state.Events {
		if e.Type == structs.TaskTerminated {
			exitEvent = e
			break
		}
	}
	require.NotNilf(t, exitEvent, "exit event not found: %v", state.Events)

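	// The task is killed with SIGTERM (signal 15); 143 is the conventional
	// 128+signal exit status for a process terminated by that signal.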
	require.Equal(t, 143, exitEvent.ExitCode)
	require.Equal(t, 15, exitEvent.Signal)
}

// TestTaskRunner_Restore_Running asserts restoring a running task does not
// rerun the task.
func TestTaskRunner_Restore_Running(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	alloc := mock.BatchAlloc()
	alloc.Job.TaskGroups[0].Count = 1
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2s",
	}
	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between task runners
	defer cleanup()

	// Run the first TaskRunner
	origTR, err := NewTaskRunner(conf)
	require.NoError(err)
	go origTR.Run()
	defer origTR.Kill(context.Background(), structs.NewTaskEvent("cleanup"))

	// Wait for it to be running
	testWaitForTaskToStart(t, origTR)

	// Cause TR to exit without shutting down task
	origTR.Shutdown()

	// Start a new TaskRunner and make sure it does not rerun the task
	newTR, err := NewTaskRunner(conf)
	require.NoError(err)

	// Do the Restore
	require.NoError(newTR.Restore())

	go newTR.Run()
	defer newTR.Kill(context.Background(), structs.NewTaskEvent("cleanup"))

	// Wait for new task runner to exit when the process does
	<-newTR.WaitCh()

	// Assert that the process was only started once
	started := 0
	state := newTR.TaskState()
	require.Equal(structs.TaskStateDead, state.State)
	for _, ev := range state.Events {
		if ev.Type == structs.TaskStarted {
			started++
		}
	}
	assert.Equal(t, 1, started)
}

// setupRestoreFailureTest starts a service, shuts down the task runner, and
// kills the task before restarting a new TaskRunner. The new TaskRunner is
// returned once it is running and waiting in pending along with a cleanup
// func.
func setupRestoreFailureTest(t *testing.T, alloc *structs.Allocation) (*TaskRunner, *Config, func()) {
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "raw_exec"
	task.Config = map[string]interface{}{
		"command": "sleep",
		"args":    []string{"30"},
	}
	task.Env = map[string]string{
		"NOMAD_PARENT_CGROUP": "nomad.slice",
		"NOMAD_ALLOC_ID":      alloc.ID,
		"NOMAD_TASK_NAME":     task.Name,
	}
	conf, cleanup1 := testTaskRunnerConfig(t, alloc, task.Name)
	conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs

	// Run the first TaskRunner
	origTR, err := NewTaskRunner(conf)
	require.NoError(t, err)
	go origTR.Run()
	cleanup2 := func() {
		origTR.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
		cleanup1()
	}

	// Wait for it to be running
	testWaitForTaskToStart(t, origTR)

	handle := origTR.getDriverHandle()
	require.NotNil(t, handle)
	taskID := handle.taskID

	// Cause TR to exit without shutting down task
	origTR.Shutdown()

	// Get the driver
	driverPlugin, err := conf.DriverManager.Dispense(rawexec.PluginID.Name)
	require.NoError(t, err)
	rawexecDriver := driverPlugin.(*rawexec.Driver)

	// Assert the task is still running despite TR having exited
	taskStatus, err := rawexecDriver.InspectTask(taskID)
	require.NoError(t, err)
	require.Equal(t, drivers.TaskStateRunning, taskStatus.State)

	// Kill the task so it fails to recover when restore is called
	require.NoError(t, rawexecDriver.DestroyTask(taskID, true))
	_, err = rawexecDriver.InspectTask(taskID)
	require.EqualError(t, err, drivers.ErrTaskNotFound.Error())

	// Create a new TaskRunner and Restore the task
	conf.ServersContactedCh = make(chan struct{})
	newTR, err := NewTaskRunner(conf)
	require.NoError(t, err)

	// Assert the TR will wait on servers because reattachment failed
	require.NoError(t, newTR.Restore())
	require.True(t, newTR.waitOnServers)

	// Start new TR
	go newTR.Run()
	cleanup3 := func() {
		newTR.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
		cleanup2()
		cleanup1()
	}

	// Assert task has not been restarted
	_, err = rawexecDriver.InspectTask(taskID)
	require.EqualError(t, err, drivers.ErrTaskNotFound.Error())
	ts := newTR.TaskState()
	require.Equal(t, structs.TaskStatePending, ts.State)

	return newTR, conf, cleanup3
}

// TestTaskRunner_Restore_Restart asserts restoring a dead task blocks until
// MarkAlive is called. #1795
func TestTaskRunner_Restore_Restart(t *testing.T) {
	ci.Parallel(t)

	newTR, conf, cleanup := setupRestoreFailureTest(t, mock.Alloc())
	defer cleanup()

	// Fake contacting the server by closing the chan
	close(conf.ServersContactedCh)

	testutil.WaitForResult(func() (bool, error) {
		ts := newTR.TaskState().State
		return ts == structs.TaskStateRunning, fmt.Errorf("expected task to be running but found %q", ts)
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestTaskRunner_Restore_Kill asserts restoring a dead task blocks until
// the task is killed. #1795
func TestTaskRunner_Restore_Kill(t *testing.T) {
	ci.Parallel(t)

	newTR, _, cleanup := setupRestoreFailureTest(t, mock.Alloc())
	defer cleanup()

	// Sending the task a terminal update shouldn't kill it or unblock it
	alloc := newTR.Alloc().Copy()
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	newTR.Update(alloc)

	require.Equal(t, structs.TaskStatePending, newTR.TaskState().State)

	// AllocRunner will immediately kill tasks after sending a terminal
	// update.
	newTR.Kill(context.Background(), structs.NewTaskEvent(structs.TaskKilling))

	select {
	case <-newTR.WaitCh():
		// It died as expected!
	case <-time.After(10 * time.Second):
		require.Fail(t, "timeout waiting for task to die")
	}
}

// TestTaskRunner_Restore_Update asserts restoring a dead task blocks until
// Update is called. #1795
func TestTaskRunner_Restore_Update(t *testing.T) {
	ci.Parallel(t)

	newTR, conf, cleanup := setupRestoreFailureTest(t, mock.Alloc())
	defer cleanup()

	// Fake Client.runAllocs behavior by calling Update then closing chan
	alloc := newTR.Alloc().Copy()
	newTR.Update(alloc)

	// Update alone should not unblock the test
	require.Equal(t, structs.TaskStatePending, newTR.TaskState().State)

	// Fake Client.runAllocs behavior of closing chan after Update
	close(conf.ServersContactedCh)

	testutil.WaitForResult(func() (bool, error) {
		ts := newTR.TaskState().State
		return ts == structs.TaskStateRunning, fmt.Errorf("expected task to be running but found %q", ts)
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestTaskRunner_Restore_System asserts restoring a dead system task does not
// block.
func TestTaskRunner_Restore_System(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeSystem
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "raw_exec"
	task.Config = map[string]interface{}{
		"command": "sleep",
		"args":    []string{"30"},
	}
	task.Env = map[string]string{
		"NOMAD_PARENT_CGROUP": "nomad.slice",
		"NOMAD_ALLOC_ID":      alloc.ID,
		"NOMAD_TASK_NAME":     task.Name,
	}
	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()
	conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between runs

	// Run the first TaskRunner
	origTR, err := NewTaskRunner(conf)
	require.NoError(t, err)
	go origTR.Run()
	defer origTR.Kill(context.Background(), structs.NewTaskEvent("cleanup"))

	// Wait for it to be running
	testWaitForTaskToStart(t, origTR)

	handle := origTR.getDriverHandle()
	require.NotNil(t, handle)
	taskID := handle.taskID

	// Cause TR to exit without shutting down task
	origTR.Shutdown()

	// Get the driver
	driverPlugin, err := conf.DriverManager.Dispense(rawexec.PluginID.Name)
	require.NoError(t, err)
	rawexecDriver := driverPlugin.(*rawexec.Driver)

	// Assert the task is still running despite TR having exited
	taskStatus, err := rawexecDriver.InspectTask(taskID)
	require.NoError(t, err)
	require.Equal(t, drivers.TaskStateRunning, taskStatus.State)

	// Kill the task so it fails to recover when restore is called
	require.NoError(t, rawexecDriver.DestroyTask(taskID, true))
	_, err = rawexecDriver.InspectTask(taskID)
	require.EqualError(t, err, drivers.ErrTaskNotFound.Error())

	// Create a new TaskRunner and Restore the task
	conf.ServersContactedCh = make(chan struct{})
	newTR, err := NewTaskRunner(conf)
	require.NoError(t, err)

	// Assert the TR will not wait on servers even though reattachment
	// failed because it is a system task.
	require.NoError(t, newTR.Restore())
	require.False(t, newTR.waitOnServers)

	// Nothing should have closed the chan
	select {
	case <-conf.ServersContactedCh:
		require.Fail(t, "serversContactedCh was closed but should not have been")
	default:
	}

	testutil.WaitForResult(func() (bool, error) {
		ts := newTR.TaskState().State
		return ts == structs.TaskStateRunning, fmt.Errorf("expected task to be running but found %q", ts)
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestTaskRunner_TaskEnv_Interpolated asserts driver configurations are
// interpolated.
func TestTaskRunner_TaskEnv_Interpolated(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	alloc := mock.BatchAlloc()
	alloc.Job.TaskGroups[0].Meta = map[string]string{
		"common_user": "somebody",
	}
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Meta = map[string]string{
		"foo": "bar",
	}

	// Use interpolation from both node attributes and meta vars
	task.Config = map[string]interface{}{
		"run_for":       "1ms",
		"stdout_string": `${node.region} ${NOMAD_META_foo} ${NOMAD_META_common_user}`,
	}

	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	// Wait for task to complete
	select {
	case <-tr.WaitCh():
	case <-time.After(3 * time.Second):
		require.Fail("timeout waiting for task to exit")
	}

	// Get the mock driver plugin
	driverPlugin, err := conf.DriverManager.Dispense(mockdriver.PluginID.Name)
	require.NoError(err)
	mockDriver := driverPlugin.(*mockdriver.Driver)

	// Assert its config has been properly interpolated
	driverCfg, mockCfg := mockDriver.GetTaskConfig()
	require.NotNil(driverCfg)
	require.NotNil(mockCfg)
	assert.Equal(t, "global bar somebody", mockCfg.StdoutString)
}

// TestTaskRunner_TaskEnv_Chroot asserts chroot drivers use chroot paths and
// not host paths.
func TestTaskRunner_TaskEnv_Chroot(t *testing.T) {
	ctestutil.ExecCompatible(t)
	ci.Parallel(t)

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "exec"
	task.Config = map[string]interface{}{
		"command": "bash",
		"args": []string{"-c", "echo $NOMAD_ALLOC_DIR; " +
			"echo $NOMAD_TASK_DIR; " +
			"echo $NOMAD_SECRETS_DIR; " +
			"echo $PATH; ",
		},
	}

	// Expect chroot paths and host $PATH
	exp := fmt.Sprintf(`/alloc
/local
/secrets
%s
`, os.Getenv("PATH"))

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	go tr.Run()
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))

	// Wait for task to exit
	timeout := 15 * time.Second
	if testutil.IsCI() {
		timeout = 120 * time.Second
	}
	select {
	case <-tr.WaitCh():
	case <-time.After(timeout):
		require.Fail(t, "timeout waiting for task to exit")
	}

	// Read stdout
	p := filepath.Join(conf.TaskDir.LogDir, task.Name+".stdout.0")
	stdout, err := ioutil.ReadFile(p)
	require.NoError(t, err)
	require.Equalf(t, exp, string(stdout), "expected: %s\n\nactual: %s\n", exp, stdout)
}

// TestTaskRunner_TaskEnv_Image asserts image drivers use chroot paths and
// not host paths. Host env vars should also be excluded.
func TestTaskRunner_TaskEnv_Image(t *testing.T) {
	ctestutil.DockerCompatible(t)
	ci.Parallel(t)
	require := require.New(t)

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "docker"
	task.Config = map[string]interface{}{
		"image":        "redis:7-alpine",
		"network_mode": "none",
		"command":      "sh",
		"args": []string{"-c", "echo $NOMAD_ALLOC_DIR; " +
			"echo $NOMAD_TASK_DIR; " +
			"echo $NOMAD_SECRETS_DIR; " +
			"echo $PATH",
		},
	}

	// Expect chroot paths and image specific PATH
	exp := `/alloc
/local
/secrets
/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
`

	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	// Wait for task to exit
	select {
	case <-tr.WaitCh():
	case <-time.After(15 * time.Second):
		require.Fail("timeout waiting for task to exit")
	}

	// Read stdout
	p := filepath.Join(conf.TaskDir.LogDir, task.Name+".stdout.0")
	stdout, err := ioutil.ReadFile(p)
	require.NoError(err)
	require.Equalf(exp, string(stdout), "expected: %s\n\nactual: %s\n", exp, stdout)
}

// TestTaskRunner_TaskEnv_None asserts raw_exec uses host paths and env vars.
func TestTaskRunner_TaskEnv_None(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "raw_exec"
	task.Config = map[string]interface{}{
		"command": "sh",
		"args": []string{"-c", "echo $NOMAD_ALLOC_DIR; " +
			"echo $NOMAD_TASK_DIR; " +
			"echo $NOMAD_SECRETS_DIR; " +
			"echo $PATH",
		},
	}
	task.Env = map[string]string{
		"NOMAD_PARENT_CGROUP": "nomad.slice",
		"NOMAD_ALLOC_ID":      alloc.ID,
		"NOMAD_TASK_NAME":     task.Name,
	}
	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	// Expect host paths
	root := filepath.Join(conf.ClientConfig.AllocDir, alloc.ID)
	taskDir := filepath.Join(root, task.Name)
	exp := fmt.Sprintf(`%s/alloc
%s/local
%s/secrets
%s
`, root, taskDir, taskDir, os.Getenv("PATH"))

	// Wait for task to exit
	select {
	case <-tr.WaitCh():
	case <-time.After(15 * time.Second):
		require.Fail("timeout waiting for task to exit")
	}

	// Read stdout
	p := filepath.Join(conf.TaskDir.LogDir, task.Name+".stdout.0")
	stdout, err := ioutil.ReadFile(p)
	require.NoError(err)
	require.Equalf(exp, string(stdout), "expected: %s\n\nactual: %s\n", exp, stdout)
}

// Test that devices get sent to the driver
func TestTaskRunner_DevicePropogation(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	// Create a mock alloc that has a gpu
	alloc := mock.BatchAlloc()
	alloc.Job.TaskGroups[0].Count = 1
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "100ms",
	}
	tRes := alloc.AllocatedResources.Tasks[task.Name]
	tRes.Devices = append(tRes.Devices, &structs.AllocatedDeviceResource{Type: "mock"})

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between task runners
	defer cleanup()

	// Setup the devicemanager
	dm, ok := conf.DeviceManager.(*devicemanager.MockManager)
	require.True(ok)

	dm.ReserveF = func(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) {
		res := &device.ContainerReservation{
			Envs: map[string]string{
				"ABC": "123",
			},
			Mounts: []*device.Mount{
				{
					ReadOnly: true,
					TaskPath: "foo",
					HostPath: "bar",
				},
			},
			Devices: []*device.DeviceSpec{
				{
					TaskPath:    "foo",
					HostPath:    "bar",
					CgroupPerms: "123",
				},
			},
		}
		return res, nil
	}

	// Run the TaskRunner
	tr, err := NewTaskRunner(conf)
	require.NoError(err)
	go tr.Run()
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))

	// Wait for task to complete
	select {
	case <-tr.WaitCh():
	case <-time.After(3 * time.Second):
	}

	// Get the mock driver plugin
	driverPlugin, err := conf.DriverManager.Dispense(mockdriver.PluginID.Name)
	require.NoError(err)
	mockDriver := driverPlugin.(*mockdriver.Driver)

	// Assert its config has been properly interpolated
	driverCfg, _ := mockDriver.GetTaskConfig()
	require.NotNil(driverCfg)
	require.Len(driverCfg.Devices, 1)
	require.Equal(driverCfg.Devices[0].Permissions, "123")
	require.Len(driverCfg.Mounts, 1)
	require.Equal(driverCfg.Mounts[0].TaskPath, "foo")
	require.Contains(driverCfg.Env, "ABC")
}

// mockEnvHook is a test hook that sets an env var and done=true. It fails if
// it's called more than once.
type mockEnvHook struct {
	called int
}

func (*mockEnvHook) Name() string {
	return "mock_env_hook"
}

func (h *mockEnvHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {
	h.called++

	resp.Done = true
	resp.Env = map[string]string{
		"mock_hook": "1",
	}

	return nil
}

// TestTaskRunner_Restore_HookEnv asserts that re-running prestart hooks with
// hook environments set restores the environment without re-running done
// hooks.
func TestTaskRunner_Restore_HookEnv(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between prestart calls
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(err)

	// Override the default hooks to only run the mock hook
	mockHook := &mockEnvHook{}
	tr.runnerHooks = []interfaces.TaskHook{mockHook}

	// Manually run prestart hooks
	require.NoError(tr.prestart())

	// Assert env was called
	require.Equal(1, mockHook.called)

	// Re-running prestart hooks should *not* call done mock hook
	require.NoError(tr.prestart())

	// Assert env was called
	require.Equal(1, mockHook.called)

	// Assert the env is still set
	env := tr.envBuilder.Build().All()
	require.Contains(env, "mock_hook")
	require.Equal("1", env["mock_hook"])
}

// This test asserts that we can recover from an "external" plugin exiting by
// retrieving a new instance of the driver and recovering the task.
func TestTaskRunner_RecoverFromDriverExiting(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	// Create an allocation using the mock driver that exits simulating the
	// driver crashing. We can then test that the task runner recovers from this
	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"plugin_exit_after": "1s",
		"run_for":           "5s",
	}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	conf.StateDB = cstate.NewMemDB(conf.Logger) // "persist" state between prestart calls
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(err)

	start := time.Now()
	go tr.Run()
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))

	// Wait for the task to be running
	testWaitForTaskToStart(t, tr)

	// Get the task ID
	tr.stateLock.RLock()
	l := tr.localState.TaskHandle
	require.NotNil(l)
	require.NotNil(l.Config)
	require.NotEmpty(l.Config.ID)
	id := l.Config.ID
	tr.stateLock.RUnlock()

	// Get the mock driver plugin
	driverPlugin, err := conf.DriverManager.Dispense(mockdriver.PluginID.Name)
	require.NoError(err)
	mockDriver := driverPlugin.(*mockdriver.Driver)

	// Wait for the task to start
	testutil.WaitForResult(func() (bool, error) {
		// Get the handle and check that it was recovered
		handle := mockDriver.GetHandle(id)
		if handle == nil {
			return false, fmt.Errorf("nil handle")
		}
		if !handle.Recovered {
			return false, fmt.Errorf("handle not recovered")
		}
		return true, nil
	}, func(err error) {
		t.Fatal(err.Error())
	})

	// Wait for task to complete
	select {
	case <-tr.WaitCh():
	case <-time.After(10 * time.Second):
	}

	// Ensure that we actually let the task complete
	require.True(time.Now().Sub(start) > 5*time.Second)

	// Check it finished successfully
	state := tr.TaskState()
	require.True(state.Successful())
}

// TestTaskRunner_ShutdownDelay asserts services are removed from Consul
// ${shutdown_delay} seconds before killing the process.
func TestTaskRunner_ShutdownDelay(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Services[0].Tags = []string{"tag1"}
	task.Services = task.Services[:1] // only need 1 for this test
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1000s",
	}

	// No shutdown escape hatch for this delay, so don't set it too high
	task.ShutdownDelay = 1000 * time.Duration(testutil.TestMultiplier()) * time.Millisecond

	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	mockConsul := conf.Consul.(*regMock.ServiceRegistrationHandler)

	// Wait for the task to start
	testWaitForTaskToStart(t, tr)

	testutil.WaitForResult(func() (bool, error) {
		ops := mockConsul.GetOps()
		if n := len(ops); n != 1 {
			return false, fmt.Errorf("expected 1 consul operation. Found %d", n)
		}
		return ops[0].Op == "add", fmt.Errorf("consul operation was not a registration: %#v", ops[0])
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Asynchronously kill task
	killSent := time.Now()
	killed := make(chan struct{})
	go func() {
		defer close(killed)
		assert.NoError(t, tr.Kill(context.Background(), structs.NewTaskEvent("test")))
	}()

	// Wait for the single de-registration call (all [non-]canary variants removed).

WAIT:
	for {
		ops := mockConsul.GetOps()
		switch n := len(ops); n {
		case 1:
			// Waiting for single de-registration call.
		case 2:
			require.Equalf(t, "remove", ops[1].Op, "expected deregistration but found: %#v", ops[1])
			break WAIT
		default:
			// ?!
			t.Fatalf("unexpected number of consul operations: %d\n%s", n, pretty.Sprint(ops))
		}

		select {
		case <-killed:
			t.Fatal("killed while service still registered")
		case <-time.After(10 * time.Millisecond):
		}
	}

	// Wait for actual exit
	select {
	case <-tr.WaitCh():
	case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
		t.Fatalf("timeout")
	}

	<-killed
	killDur := time.Now().Sub(killSent)
	if killDur < task.ShutdownDelay {
		t.Fatalf("task killed before shutdown_delay (killed_after: %s; shutdown_delay: %s)",
			killDur, task.ShutdownDelay,
		)
	}
}

// TestTaskRunner_NoShutdownDelay asserts services are removed from
// Consul and tasks are killed without waiting for ${shutdown_delay}
// when the alloc has the NoShutdownDelay transition flag set.
func TestTaskRunner_NoShutdownDelay(t *testing.T) {
	ci.Parallel(t)

	// don't set this too high so that we don't block the test runner
	// on shutting down the agent if the test fails
	maxTestDuration := time.Duration(testutil.TestMultiplier()*10) * time.Second
	maxTimeToFailDuration := time.Duration(testutil.TestMultiplier()) * time.Second

	alloc := mock.Alloc()
	alloc.DesiredTransition = structs.DesiredTransition{NoShutdownDelay: helper.BoolToPtr(true)}
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Services[0].Tags = []string{"tag1"}
	task.Services = task.Services[:1] // only need 1 for this test
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1000s",
	}
	task.ShutdownDelay = maxTestDuration

	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
	defer cleanup()

	mockConsul := conf.Consul.(*regMock.ServiceRegistrationHandler)

	testWaitForTaskToStart(t, tr)

	testutil.WaitForResult(func() (bool, error) {
		ops := mockConsul.GetOps()
		if n := len(ops); n != 1 {
			return false, fmt.Errorf("expected 1 consul operation. Found %d", n)
		}
		return ops[0].Op == "add", fmt.Errorf("consul operation was not a registration: %#v", ops[0])
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	testCtx, cancel := context.WithTimeout(context.Background(), maxTimeToFailDuration)
	defer cancel()

	killed := make(chan error)
	go func() {
		tr.shutdownDelayCancel()
		err := tr.Kill(testCtx, structs.NewTaskEvent("test"))
		killed <- err
	}()

	// Wait for first de-registration call. Note that unlike
	// TestTaskRunner_ShutdownDelay, we're racing with task exit
	// and can't assert that we only get the first deregistration op
	// (from serviceHook.PreKill).
	testutil.WaitForResult(func() (bool, error) {
		ops := mockConsul.GetOps()
		if n := len(ops); n < 2 {
			return false, fmt.Errorf("expected at least 2 consul operations.")
		}
		return ops[1].Op == "remove", fmt.Errorf(
			"consul operation was not a deregistration: %#v", ops[1])
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Wait for the task to exit
	select {
	case <-tr.WaitCh():
	case <-time.After(maxTimeToFailDuration):
		t.Fatalf("task kill did not ignore shutdown delay")
		return
	}

	err := <-killed
	require.NoError(t, err, "killing task returned unexpected error")
}

// TestTaskRunner_Dispatch_Payload asserts that a dispatch job runs and the
|
|
|
|
// payload was written to disk.
|
|
|
|
func TestTaskRunner_Dispatch_Payload(t *testing.T) {
|
2022-03-15 12:42:43 +00:00
|
|
|
ci.Parallel(t)
|
2019-01-15 23:19:51 +00:00
|
|
|
|
|
|
|
alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Driver = "mock_driver"
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "1s",
|
|
|
|
}
|
|
|
|
|
|
|
|
fileName := "test"
|
|
|
|
task.DispatchPayload = &structs.DispatchPayloadConfig{
|
|
|
|
File: fileName,
|
|
|
|
}
|
|
|
|
alloc.Job.ParameterizedJob = &structs.ParameterizedJobConfig{}
|
|
|
|
|
|
|
|
// Add a payload (they're snappy encoded bytes)
|
|
|
|
expected := []byte("hello world")
|
|
|
|
compressed := snappy.Encode(nil, expected)
|
|
|
|
alloc.Job.Payload = compressed
|
|
|
|
|
2019-02-13 16:25:25 +00:00
|
|
|
tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
|
2019-01-15 23:19:51 +00:00
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Wait for it to finish
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
|
|
|
ts := tr.TaskState()
|
|
|
|
return ts.State == structs.TaskStateDead, fmt.Errorf("%v", ts.State)
|
|
|
|
}, func(err error) {
|
|
|
|
require.NoError(t, err)
|
|
|
|
})
|
|
|
|
|
|
|
|
// Should have exited successfully
|
|
|
|
ts := tr.TaskState()
|
|
|
|
require.False(t, ts.Failed)
|
|
|
|
require.Zero(t, ts.Restarts)
|
|
|
|
|
|
|
|
// Check that the file was written to disk properly
|
|
|
|
payloadPath := filepath.Join(tr.taskDir.LocalDir, fileName)
|
|
|
|
data, err := ioutil.ReadFile(payloadPath)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, expected, data)
|
|
|
|
}
|
|
|
|
|
// TestTaskRunner_SignalFailure asserts that signal errors are properly
// propagated from the driver to TaskRunner.
func TestTaskRunner_SignalFailure(t *testing.T) {
	ci.Parallel(t)

alloc := mock.Alloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Driver = "mock_driver"
|
|
|
|
errMsg := "test forcing failure"
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "10m",
|
|
|
|
"signal_error": errMsg,
|
|
|
|
}
|
|
|
|
|
	tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
defer cleanup()
|
|
|
|
|
|
|
|
testWaitForTaskToStart(t, tr)
|
|
|
|
|
|
|
|
require.EqualError(t, tr.Signal(&structs.TaskEvent{}, "SIGINT"), errMsg)
}
|
|
|
|
|
// TestTaskRunner_RestartTask asserts that restarting a task works and emits a
// Restarting event.
func TestTaskRunner_RestartTask(t *testing.T) {
	ci.Parallel(t)

alloc := mock.Alloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Driver = "mock_driver"
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "10m",
|
|
|
|
}
|
|
|
|
|
	tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
defer cleanup()
|
|
|
|
|
|
|
|
testWaitForTaskToStart(t, tr)
|
|
|
|
|
|
|
|
// Restart task. Send a RestartSignal event like check watcher. Restart
|
|
|
|
// handler emits the Restarting event.
|
|
|
|
event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason("test")
|
|
|
|
const fail = false
|
|
|
|
tr.Restart(context.Background(), event.Copy(), fail)
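	// A copy of the event is passed so the original can still be compared
	// against the emitted Restart Signaled event below.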
|
|
|
|
|
|
|
|
// Wait for it to restart and be running again
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
|
|
|
ts := tr.TaskState()
|
|
|
|
if ts.Restarts != 1 {
|
|
|
|
return false, fmt.Errorf("expected 1 restart but found %d\nevents: %s",
|
|
|
|
ts.Restarts, pretty.Sprint(ts.Events))
|
|
|
|
}
|
|
|
|
if ts.State != structs.TaskStateRunning {
|
|
|
|
return false, fmt.Errorf("expected running but received %s", ts.State)
|
|
|
|
}
|
|
|
|
return true, nil
|
|
|
|
}, func(err error) {
|
|
|
|
require.NoError(t, err)
|
|
|
|
})
|
|
|
|
|
|
|
|
// Assert the expected Restarting event was emitted
|
|
|
|
found := false
|
|
|
|
events := tr.TaskState().Events
|
|
|
|
for _, e := range events {
|
|
|
|
if e.Type == structs.TaskRestartSignal {
|
|
|
|
found = true
|
|
|
|
require.Equal(t, event.Time, e.Time)
|
|
|
|
require.Equal(t, event.RestartReason, e.RestartReason)
|
|
|
|
require.Contains(t, e.DisplayMessage, event.RestartReason)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
require.True(t, found, "restarting task event not found", pretty.Sprint(events))
|
|
|
|
}

// TestTaskRunner_CheckWatcher_Restart asserts that when enabled an unhealthy
// Consul check will cause a task to restart following restart policy rules.
func TestTaskRunner_CheckWatcher_Restart(t *testing.T) {
	ci.Parallel(t)

alloc := mock.Alloc()
|
|
|
|
|
|
|
|
// Make the restart policy fail within this test
|
|
|
|
tg := alloc.Job.TaskGroups[0]
|
|
|
|
tg.RestartPolicy.Attempts = 2
|
|
|
|
tg.RestartPolicy.Interval = 1 * time.Minute
|
|
|
|
tg.RestartPolicy.Delay = 10 * time.Millisecond
|
|
|
|
tg.RestartPolicy.Mode = structs.RestartPolicyModeFail
|
|
|
|
|
|
|
|
task := tg.Tasks[0]
|
|
|
|
task.Driver = "mock_driver"
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "10m",
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make the task register a check that fails
|
|
|
|
task.Services[0].Checks[0] = &structs.ServiceCheck{
|
|
|
|
Name: "test-restarts",
|
|
|
|
Type: structs.ServiceCheckTCP,
|
|
|
|
Interval: 50 * time.Millisecond,
|
|
|
|
CheckRestart: &structs.CheckRestart{
|
|
|
|
Limit: 2,
|
|
|
|
Grace: 100 * time.Millisecond,
|
|
|
|
},
|
|
|
|
}
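	// This check never becomes healthy: the mock Consul agent configured below
	// reports every check as critical, which is what drives the restart
	// sequence asserted at the end of this test.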
	task.Services[0].Provider = structs.ServiceProviderConsul

conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Replace mock Consul ServiceClient, with the real ServiceClient
|
|
|
|
// backed by a mock consul whose checks are always unhealthy.
	consulAgent := agentconsul.NewMockAgent(agentconsul.Features{
		Enterprise: false,
		Namespaces: false,
	})
	consulAgent.SetStatus("critical")
	namespacesClient := agentconsul.NewNamespacesClient(agentconsul.NewMockNamespaces(nil), consulAgent)
	consulClient := agentconsul.NewServiceClient(consulAgent, namespacesClient, conf.Logger, true)
go consulClient.Run()
|
|
|
|
defer consulClient.Shutdown()
|
|
|
|
|
|
|
|
conf.Consul = consulClient
	conf.ServiceRegWrapper = wrapper.NewHandlerWrapper(conf.Logger, consulClient, nil)

tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
|
|
|
|
expectedEvents := []string{
|
|
|
|
"Received",
|
|
|
|
"Task Setup",
|
|
|
|
"Started",
|
|
|
|
"Restart Signaled",
|
|
|
|
"Terminated",
|
|
|
|
"Restarting",
|
|
|
|
"Started",
|
|
|
|
"Restart Signaled",
|
|
|
|
"Terminated",
|
|
|
|
"Restarting",
|
|
|
|
"Started",
|
|
|
|
"Restart Signaled",
|
|
|
|
"Terminated",
|
|
|
|
"Not Restarting",
|
|
|
|
}
|
|
|
|
|
|
|
|
// Bump maxEvents so task events aren't dropped
|
|
|
|
tr.maxEvents = 100
|
|
|
|
|
|
|
|
go tr.Run()
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
|
|
|
|
// Wait until the task exits. Don't simply wait for it to run as it may
|
|
|
|
// get restarted and terminated before the test is able to observe it
|
|
|
|
// running.
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
require.Fail(t, "timeout")
|
|
|
|
}
|
|
|
|
|
|
|
|
state := tr.TaskState()
|
|
|
|
actualEvents := make([]string, len(state.Events))
|
|
|
|
for i, e := range state.Events {
|
|
|
|
actualEvents[i] = string(e.Type)
|
|
|
|
}
|
|
|
|
	require.Equal(t, expectedEvents, actualEvents)
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.True(t, state.Failed, pretty.Sprint(state))
|
|
|
|
}
|
|
|
|
|
2019-12-18 16:23:16 +00:00
|
|
|
type mockEnvoyBootstrapHook struct {
|
|
|
|
// nothing
|
|
|
|
}
|
2019-12-06 20:46:46 +00:00
|
|
|
|
2019-12-18 16:23:16 +00:00
|
|
|
func (_ *mockEnvoyBootstrapHook) Name() string {
|
2019-12-06 20:46:46 +00:00
|
|
|
return "mock_envoy_bootstrap"
|
|
|
|
}
|
|
|
|
|
2020-01-28 22:33:59 +00:00
|
|
|
func (_ *mockEnvoyBootstrapHook) Prestart(_ context.Context, _ *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {
|
2019-12-06 20:46:46 +00:00
|
|
|
resp.Done = true
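	// Marking the response Done tells the task runner this prestart hook does
	// not need to run again on subsequent starts.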
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// The envoy bootstrap hook tries to connect to consul and run the envoy
|
|
|
|
// bootstrap command, so turn it off when testing connect jobs that are not
|
2019-12-18 16:23:16 +00:00
|
|
|
// using envoy.
|
|
|
|
func useMockEnvoyBootstrapHook(tr *TaskRunner) {
|
|
|
|
mock := new(mockEnvoyBootstrapHook)
|
2019-12-06 20:46:46 +00:00
|
|
|
for i, hook := range tr.runnerHooks {
|
|
|
|
if _, ok := hook.(*envoyBootstrapHook); ok {
|
2019-12-18 16:23:16 +00:00
|
|
|
tr.runnerHooks[i] = mock
|
2019-12-06 20:46:46 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestTaskRunner_BlockForSIDSToken asserts tasks do not start until a Consul
|
|
|
|
// Service Identity token is derived.
|
|
|
|
func TestTaskRunner_BlockForSIDSToken(t *testing.T) {
	ci.Parallel(t)
	r := require.New(t)

alloc := mock.BatchConnectAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "0s",
|
2019-11-27 21:41:45 +00:00
|
|
|
}
|
|
|
|
|
2019-12-06 20:46:46 +00:00
|
|
|
trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
2019-11-27 21:41:45 +00:00
|
|
|
defer cleanup()
|
|
|
|
|
2020-01-28 22:33:59 +00:00
|
|
|
// set a consul token on the Nomad client's consul config, because that is
|
|
|
|
// what gates the action of requesting SI token(s)
|
|
|
|
trConfig.ClientConfig.ConsulConfig.Token = uuid.Generate()
|
|
|
|
|
2019-11-27 21:41:45 +00:00
|
|
|
// control when we get a Consul SI token
|
2020-01-15 15:29:47 +00:00
|
|
|
token := uuid.Generate()
|
2019-11-27 21:41:45 +00:00
|
|
|
waitCh := make(chan struct{})
|
|
|
|
deriveFn := func(*structs.Allocation, []string) (map[string]string, error) {
|
|
|
|
<-waitCh
|
2019-12-06 20:46:46 +00:00
|
|
|
return map[string]string{task.Name: token}, nil
|
2019-11-27 21:41:45 +00:00
|
|
|
}
|
|
|
|
siClient := trConfig.ConsulSI.(*consulapi.MockServiceIdentitiesClient)
|
|
|
|
siClient.DeriveTokenFn = deriveFn
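	// DeriveTokenFn blocks on waitCh, so the task cannot progress past the SI
	// token hook until the channel is closed below.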
|
|
|
|
|
|
|
|
// start the task runner
|
|
|
|
tr, err := NewTaskRunner(trConfig)
|
|
|
|
r.NoError(err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
2019-12-18 16:23:16 +00:00
|
|
|
useMockEnvoyBootstrapHook(tr) // mock the envoy bootstrap hook
|
|
|
|
|
2019-11-27 21:41:45 +00:00
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// assert task runner blocks on SI token
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
r.Fail("task_runner exited before si unblocked")
|
|
|
|
case <-time.After(100 * time.Millisecond):
|
|
|
|
}
|
|
|
|
|
|
|
|
// assert task state is still pending
|
|
|
|
r.Equal(structs.TaskStatePending, tr.TaskState().State)
|
|
|
|
|
|
|
|
// unblock service identity token
|
|
|
|
close(waitCh)
|
|
|
|
|
|
|
|
// task runner should exit now that it has been unblocked and it is a batch
|
|
|
|
// job with a zero sleep time
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(15 * time.Second * time.Duration(testutil.TestMultiplier())):
|
|
|
|
		r.Fail("timed out waiting for batch task to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
// assert task exited successfully
|
|
|
|
finalState := tr.TaskState()
|
|
|
|
r.Equal(structs.TaskStateDead, finalState.State)
|
|
|
|
r.False(finalState.Failed)
|
|
|
|
|
|
|
|
// assert the token is on disk
|
|
|
|
tokenPath := filepath.Join(trConfig.TaskDir.SecretsDir, sidsTokenFile)
|
|
|
|
data, err := ioutil.ReadFile(tokenPath)
|
|
|
|
r.NoError(err)
|
|
|
|
r.Equal(token, string(data))
|
|
|
|
}
|
|
|
|
|
func TestTaskRunner_DeriveSIToken_Retry(t *testing.T) {
	ci.Parallel(t)
r := require.New(t)
|
|
|
|
|
|
|
|
alloc := mock.BatchConnectAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "0s",
|
|
|
|
}
|
|
|
|
|
|
|
|
trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
2020-01-28 22:33:59 +00:00
|
|
|
// set a consul token on the Nomad client's consul config, because that is
|
|
|
|
// what gates the action of requesting SI token(s)
|
|
|
|
trConfig.ClientConfig.ConsulConfig.Token = uuid.Generate()
|
|
|
|
|
2020-01-15 16:41:52 +00:00
|
|
|
// control when we get a Consul SI token (recoverable failure on first call)
|
2020-01-15 15:29:47 +00:00
|
|
|
token := uuid.Generate()
|
2019-12-06 20:46:46 +00:00
|
|
|
deriveCount := 0
|
|
|
|
deriveFn := func(*structs.Allocation, []string) (map[string]string, error) {
|
|
|
|
if deriveCount > 0 {
return map[string]string{task.Name: token}, nil
|
2019-12-06 20:46:46 +00:00
|
|
|
}
|
|
|
|
deriveCount++
|
|
|
|
return nil, structs.NewRecoverableError(errors.New("try again later"), true)
|
|
|
|
}
|
|
|
|
siClient := trConfig.ConsulSI.(*consulapi.MockServiceIdentitiesClient)
|
|
|
|
siClient.DeriveTokenFn = deriveFn
|
|
|
|
|
|
|
|
// start the task runner
|
|
|
|
tr, err := NewTaskRunner(trConfig)
|
|
|
|
r.NoError(err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
2019-12-18 16:23:16 +00:00
|
|
|
useMockEnvoyBootstrapHook(tr) // mock the envoy bootstrap
|
2019-12-06 20:46:46 +00:00
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// assert task runner blocks on SI token
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
r.Fail("timed out waiting for task runner")
|
|
|
|
}
|
|
|
|
|
|
|
|
// assert task exited successfully
|
|
|
|
finalState := tr.TaskState()
|
|
|
|
r.Equal(structs.TaskStateDead, finalState.State)
|
|
|
|
r.False(finalState.Failed)
|
|
|
|
|
|
|
|
// assert the token is on disk
|
|
|
|
tokenPath := filepath.Join(trConfig.TaskDir.SecretsDir, sidsTokenFile)
|
|
|
|
data, err := ioutil.ReadFile(tokenPath)
|
|
|
|
r.NoError(err)
|
|
|
|
r.Equal(token, string(data))
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestTaskRunner_DeriveSIToken_Unrecoverable asserts that an unrecoverable error
|
|
|
|
// from deriving a service identity token will fail a task.
|
|
|
|
func TestTaskRunner_DeriveSIToken_Unrecoverable(t *testing.T) {
|
	ci.Parallel(t)
r := require.New(t)
|
|
|
|
|
|
|
|
alloc := mock.BatchConnectAlloc()
|
|
|
|
tg := alloc.Job.TaskGroups[0]
|
|
|
|
tg.RestartPolicy.Attempts = 0
|
|
|
|
tg.RestartPolicy.Interval = 0
|
|
|
|
tg.RestartPolicy.Delay = 0
|
|
|
|
tg.RestartPolicy.Mode = structs.RestartPolicyModeFail
|
|
|
|
task := tg.Tasks[0]
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "0s",
|
|
|
|
}
|
|
|
|
|
|
|
|
trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
2020-01-28 22:33:59 +00:00
|
|
|
// set a consul token on the Nomad client's consul config, because that is
|
|
|
|
// what gates the action of requesting SI token(s)
|
|
|
|
trConfig.ClientConfig.ConsulConfig.Token = uuid.Generate()
|
|
|
|
|
2019-12-06 20:46:46 +00:00
|
|
|
// SI token derivation suffers a non-retryable error
|
|
|
|
siClient := trConfig.ConsulSI.(*consulapi.MockServiceIdentitiesClient)
|
2019-12-19 23:40:30 +00:00
|
|
|
siClient.SetDeriveTokenError(alloc.ID, []string{task.Name}, errors.New("non-recoverable"))
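	// An unrecoverable derivation error should fail the task outright rather
	// than being retried.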
|
2019-12-06 20:46:46 +00:00
|
|
|
|
|
|
|
tr, err := NewTaskRunner(trConfig)
|
|
|
|
r.NoError(err)
|
|
|
|
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
2019-12-18 16:23:16 +00:00
|
|
|
useMockEnvoyBootstrapHook(tr) // mock the envoy bootstrap hook
|
2019-12-06 20:46:46 +00:00
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// Wait for the task to die
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
require.Fail(t, "timed out waiting for task runner to fail")
|
|
|
|
}
|
|
|
|
|
|
|
|
// assert we have died and failed
|
|
|
|
finalState := tr.TaskState()
|
|
|
|
r.Equal(structs.TaskStateDead, finalState.State)
|
|
|
|
r.True(finalState.Failed)
|
|
|
|
r.Equal(5, len(finalState.Events))
|
|
|
|
/*
|
|
|
|
+ event: Task received by client
|
|
|
|
+ event: Building Task Directory
|
|
|
|
+ event: consul: failed to derive SI token: non-recoverable
|
|
|
|
+ event: consul_sids: context canceled
|
|
|
|
+ event: Policy allows no restarts
|
|
|
|
*/
|
|
|
|
r.Equal("true", finalState.Events[2].Details["fails_task"])
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestTaskRunner_BlockForVaultToken asserts tasks do not start until a vault token
|
// is derived.
func TestTaskRunner_BlockForVaultToken(t *testing.T) {
	ci.Parallel(t)

alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "0s",
|
|
|
|
}
|
|
|
|
task.Vault = &structs.Vault{Policies: []string{"default"}}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Control when we get a Vault token
|
|
|
|
token := "1234"
|
|
|
|
waitCh := make(chan struct{})
|
|
|
|
handler := func(*structs.Allocation, []string) (map[string]string, error) {
|
|
|
|
<-waitCh
|
|
|
|
return map[string]string{task.Name: token}, nil
|
|
|
|
}
|
|
|
|
vaultClient := conf.Vault.(*vaultclient.MockVaultClient)
|
|
|
|
vaultClient.DeriveTokenFn = handler
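	// Token derivation blocks until waitCh is closed, which should keep the
	// task in the pending state.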
|
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// Assert TR blocks on vault token (does *not* exit)
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
require.Fail(t, "tr exited before vault unblocked")
|
|
|
|
case <-time.After(1 * time.Second):
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert task state is still Pending
|
|
|
|
require.Equal(t, structs.TaskStatePending, tr.TaskState().State)
|
|
|
|
|
|
|
|
// Unblock vault token
|
|
|
|
close(waitCh)
|
|
|
|
|
|
|
|
	// TR should exit now that it's unblocked by vault as it's a batch job
|
|
|
|
// with 0 sleeping.
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
2019-02-14 16:12:06 +00:00
|
|
|
case <-time.After(15 * time.Second * time.Duration(testutil.TestMultiplier())):
|
2019-02-12 21:46:09 +00:00
|
|
|
require.Fail(t, "timed out waiting for batch task to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Assert task exited successfully
|
|
|
|
finalState := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, finalState.State)
|
|
|
|
require.False(t, finalState.Failed)
|
|
|
|
|
|
|
|
// Check that the token is on disk
|
|
|
|
tokenPath := filepath.Join(conf.TaskDir.SecretsDir, vaultTokenFile)
|
|
|
|
data, err := ioutil.ReadFile(tokenPath)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, token, string(data))
|
|
|
|
|
|
|
|
// Check the token was revoked
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
|
|
|
if len(vaultClient.StoppedTokens()) != 1 {
|
2019-02-14 16:11:37 +00:00
|
|
|
return false, fmt.Errorf("Expected a stopped token %q but found: %v", token, vaultClient.StoppedTokens())
|
2019-02-12 21:46:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if a := vaultClient.StoppedTokens()[0]; a != token {
|
|
|
|
return false, fmt.Errorf("got stopped token %q; want %q", a, token)
|
|
|
|
}
|
|
|
|
return true, nil
|
|
|
|
}, func(err error) {
|
|
|
|
require.Fail(t, err.Error())
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-02-12 23:12:56 +00:00
|
|
|
// TestTaskRunner_DeriveToken_Retry asserts that if a recoverable error is
|
|
|
|
// returned when deriving a vault token a task will continue to block while
|
|
|
|
// it's retried.
|
|
|
|
func TestTaskRunner_DeriveToken_Retry(t *testing.T) {
	ci.Parallel(t)
alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Vault = &structs.Vault{Policies: []string{"default"}}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Fail on the first attempt to derive a vault token
|
|
|
|
token := "1234"
|
|
|
|
count := 0
|
|
|
|
handler := func(*structs.Allocation, []string) (map[string]string, error) {
|
|
|
|
if count > 0 {
|
|
|
|
return map[string]string{task.Name: token}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
count++
|
|
|
|
return nil, structs.NewRecoverableError(fmt.Errorf("Want a retry"), true)
|
|
|
|
}
|
|
|
|
vaultClient := conf.Vault.(*vaultclient.MockVaultClient)
|
|
|
|
vaultClient.DeriveTokenFn = handler
|
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// Wait for TR to exit and check its state
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
require.Fail(t, "timed out waiting for task runner to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.False(t, state.Failed)
|
|
|
|
|
|
|
|
require.Equal(t, 1, count)
|
|
|
|
|
|
|
|
// Check that the token is on disk
|
|
|
|
tokenPath := filepath.Join(conf.TaskDir.SecretsDir, vaultTokenFile)
|
|
|
|
data, err := ioutil.ReadFile(tokenPath)
|
|
|
|
require.NoError(t, err)
|
|
|
|
require.Equal(t, token, string(data))
|
|
|
|
|
|
|
|
// Check the token was revoked
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
|
|
|
if len(vaultClient.StoppedTokens()) != 1 {
|
|
|
|
return false, fmt.Errorf("Expected a stopped token: %v", vaultClient.StoppedTokens())
|
|
|
|
}
|
|
|
|
|
|
|
|
if a := vaultClient.StoppedTokens()[0]; a != token {
|
|
|
|
return false, fmt.Errorf("got stopped token %q; want %q", a, token)
|
|
|
|
}
|
|
|
|
return true, nil
|
|
|
|
}, func(err error) {
|
|
|
|
require.Fail(t, err.Error())
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestTaskRunner_DeriveToken_Unrecoverable asserts that an unrecoverable error
|
|
|
|
// from deriving a vault token will fail a task.
|
|
|
|
func TestTaskRunner_DeriveToken_Unrecoverable(t *testing.T) {
|
	ci.Parallel(t)

// Use a batch job with no restarts
|
|
|
|
alloc := mock.BatchAlloc()
|
|
|
|
tg := alloc.Job.TaskGroups[0]
|
|
|
|
tg.RestartPolicy.Attempts = 0
|
|
|
|
tg.RestartPolicy.Interval = 0
|
|
|
|
tg.RestartPolicy.Delay = 0
|
|
|
|
tg.RestartPolicy.Mode = structs.RestartPolicyModeFail
|
|
|
|
task := tg.Tasks[0]
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "0s",
|
|
|
|
}
|
|
|
|
task.Vault = &structs.Vault{Policies: []string{"default"}}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Error the token derivation
|
|
|
|
vaultClient := conf.Vault.(*vaultclient.MockVaultClient)
|
|
|
|
vaultClient.SetDeriveTokenError(alloc.ID, []string{task.Name}, fmt.Errorf("Non recoverable"))
|
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// Wait for the task to die
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
require.Fail(t, "timed out waiting for task runner to fail")
|
|
|
|
}
|
|
|
|
|
|
|
|
// Task should be dead and last event should have failed task
|
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.True(t, state.Failed)
|
|
|
|
require.Len(t, state.Events, 3)
|
|
|
|
require.True(t, state.Events[2].FailsTask)
|
|
|
|
}
|
|
|
|
|
2019-04-01 21:17:42 +00:00
|
|
|
// TestTaskRunner_Download_ChrootExec asserts that downloaded artifacts may be
|
|
|
|
// executed in a chroot.
|
|
|
|
func TestTaskRunner_Download_ChrootExec(t *testing.T) {
	ci.Parallel(t)
ctestutil.ExecCompatible(t)
|
|
|
|
|
|
|
|
ts := httptest.NewServer(http.FileServer(http.Dir(filepath.Dir("."))))
|
|
|
|
defer ts.Close()
|
|
|
|
|
|
|
|
// Create a task that downloads a script and executes it.
|
|
|
|
alloc := mock.BatchAlloc()
|
|
|
|
alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{}
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
	task.RestartPolicy = &structs.RestartPolicy{}
task.Driver = "exec"
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"command": "noop.sh",
|
|
|
|
}
|
task.Artifacts = []*structs.TaskArtifact{
|
|
|
|
{
|
|
|
|
GetterSource: fmt.Sprintf("%s/testdata/noop.sh", ts.URL),
|
|
|
|
GetterMode: "file",
|
|
|
|
RelativeDest: "noop.sh",
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Wait for task to run and exit
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
2019-04-02 18:17:12 +00:00
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
2019-04-01 21:17:42 +00:00
|
|
|
require.Fail(t, "timed out waiting for task runner to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.False(t, state.Failed)
|
|
|
|
}
|
|
|
|
|
|
|
|
// TestTaskRunner_Download_RawExec asserts that downloaded artifacts may be
|
|
|
|
// executed in a driver without filesystem isolation.
|
|
|
|
func TestTaskRunner_Download_RawExec(t *testing.T) {
|
	ci.Parallel(t)

ts := httptest.NewServer(http.FileServer(http.Dir(filepath.Dir("."))))
|
|
|
|
defer ts.Close()
|
|
|
|
|
|
|
|
// Create a task that downloads a script and executes it.
|
|
|
|
alloc := mock.BatchAlloc()
|
|
|
|
alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{}
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
	task.RestartPolicy = &structs.RestartPolicy{}
task.Driver = "raw_exec"
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"command": "noop.sh",
|
|
|
|
}
task.Env = map[string]string{
|
|
|
|
"NOMAD_PARENT_CGROUP": "nomad.slice",
|
|
|
|
"NOMAD_ALLOC_ID": alloc.ID,
|
|
|
|
"NOMAD_TASK_NAME": task.Name,
|
|
|
|
}
|
2019-04-01 21:17:42 +00:00
|
|
|
task.Artifacts = []*structs.TaskArtifact{
|
|
|
|
{
|
|
|
|
GetterSource: fmt.Sprintf("%s/testdata/noop.sh", ts.URL),
|
|
|
|
GetterMode: "file",
|
|
|
|
RelativeDest: "noop.sh",
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Wait for task to run and exit
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
require.Fail(t, "timed out waiting for task runner to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.False(t, state.Failed)
|
|
|
|
}
|
|
|
|
|
// TestTaskRunner_Download_List asserts that multiple artifacts are downloaded
// before a task is run.
func TestTaskRunner_Download_List(t *testing.T) {
	ci.Parallel(t)
ts := httptest.NewServer(http.FileServer(http.Dir(filepath.Dir("."))))
|
|
|
|
defer ts.Close()
|
|
|
|
|
|
|
|
// Create an allocation that has a task with a list of artifacts.
|
|
|
|
alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
f1 := "task_runner_test.go"
|
|
|
|
f2 := "task_runner.go"
|
|
|
|
artifact1 := structs.TaskArtifact{
|
|
|
|
GetterSource: fmt.Sprintf("%s/%s", ts.URL, f1),
|
|
|
|
}
|
|
|
|
artifact2 := structs.TaskArtifact{
|
|
|
|
GetterSource: fmt.Sprintf("%s/%s", ts.URL, f2),
|
|
|
|
}
|
|
|
|
task.Artifacts = []*structs.TaskArtifact{&artifact1, &artifact2}
|
|
|
|
|
	tr, conf, cleanup := runTestTaskRunner(t, alloc, task.Name)
defer cleanup()
|
|
|
|
|
|
|
|
// Wait for task to run and exit
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
require.Fail(t, "timed out waiting for task runner to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.False(t, state.Failed)
|
|
|
|
|
|
|
|
require.Len(t, state.Events, 5)
|
|
|
|
assert.Equal(t, structs.TaskReceived, state.Events[0].Type)
|
|
|
|
assert.Equal(t, structs.TaskSetup, state.Events[1].Type)
|
|
|
|
assert.Equal(t, structs.TaskDownloadingArtifacts, state.Events[2].Type)
|
|
|
|
assert.Equal(t, structs.TaskStarted, state.Events[3].Type)
|
|
|
|
assert.Equal(t, structs.TaskTerminated, state.Events[4].Type)
|
|
|
|
|
|
|
|
// Check that both files exist.
|
2019-02-13 16:25:25 +00:00
|
|
|
_, err := os.Stat(filepath.Join(conf.TaskDir.Dir, f1))
|
2019-02-12 23:48:04 +00:00
|
|
|
require.NoErrorf(t, err, "%v not downloaded", f1)
|
|
|
|
|
|
|
|
_, err = os.Stat(filepath.Join(conf.TaskDir.Dir, f2))
|
|
|
|
require.NoErrorf(t, err, "%v not downloaded", f2)
|
|
|
|
}
|
|
|
|
|
2019-02-13 16:26:23 +00:00
|
|
|
// TestTaskRunner_Download_Retries asserts that failed artifact downloads are
|
|
|
|
// retried according to the task's restart policy.
|
|
|
|
func TestTaskRunner_Download_Retries(t *testing.T) {
	ci.Parallel(t)

// Create an allocation that has a task with bad artifacts.
|
|
|
|
alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
artifact := structs.TaskArtifact{
|
|
|
|
GetterSource: "http://127.0.0.1:0/foo/bar/baz",
|
|
|
|
}
|
|
|
|
task.Artifacts = []*structs.TaskArtifact{&artifact}
|
|
|
|
|
|
|
|
// Make the restart policy retry once
|
2020-03-25 01:52:39 +00:00
|
|
|
rp := &structs.RestartPolicy{
|
2019-02-13 16:26:23 +00:00
|
|
|
Attempts: 1,
|
|
|
|
Interval: 10 * time.Minute,
|
|
|
|
Delay: 1 * time.Second,
|
|
|
|
Mode: structs.RestartPolicyModeFail,
|
|
|
|
}
|
2020-03-25 01:52:39 +00:00
|
|
|
alloc.Job.TaskGroups[0].RestartPolicy = rp
|
|
|
|
alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy = rp
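	// With a single attempt allowed, the failed download should be retried
	// exactly once before the task is marked failed, matching the event
	// sequence asserted below.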
|
2019-02-13 16:26:23 +00:00
|
|
|
|
|
|
|
tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
require.Fail(t, "timed out waiting for task to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.True(t, state.Failed)
|
|
|
|
require.Len(t, state.Events, 8, pretty.Sprint(state.Events))
|
|
|
|
require.Equal(t, structs.TaskReceived, state.Events[0].Type)
|
|
|
|
require.Equal(t, structs.TaskSetup, state.Events[1].Type)
|
|
|
|
require.Equal(t, structs.TaskDownloadingArtifacts, state.Events[2].Type)
|
|
|
|
require.Equal(t, structs.TaskArtifactDownloadFailed, state.Events[3].Type)
|
|
|
|
require.Equal(t, structs.TaskRestarting, state.Events[4].Type)
|
|
|
|
require.Equal(t, structs.TaskDownloadingArtifacts, state.Events[5].Type)
|
|
|
|
require.Equal(t, structs.TaskArtifactDownloadFailed, state.Events[6].Type)
|
|
|
|
require.Equal(t, structs.TaskNotRestarting, state.Events[7].Type)
|
|
|
|
}
|
|
|
|
|
2019-02-13 22:51:05 +00:00
|
|
|
// TestTaskRunner_DriverNetwork asserts that a driver's network is properly
|
|
|
|
// used in services and checks.
|
|
|
|
func TestTaskRunner_DriverNetwork(t *testing.T) {
|
	ci.Parallel(t)

alloc := mock.Alloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Driver = "mock_driver"
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "100s",
|
|
|
|
"driver_ip": "10.1.2.3",
|
|
|
|
"driver_port_map": "http:80",
|
|
|
|
}
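	// driver_ip and driver_port_map make the mock driver advertise a driver
	// network (10.1.2.3 with the "http" label mapped to port 80) that the
	// address-mode services and checks below resolve against.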
|
|
|
|
|
|
|
|
// Create services and checks with custom address modes to exercise
|
|
|
|
// address detection logic
|
|
|
|
task.Services = []*structs.Service{
|
|
|
|
{
|
|
|
|
Name: "host-service",
|
|
|
|
PortLabel: "http",
|
|
|
|
AddressMode: "host",
|
2022-03-21 09:29:57 +00:00
|
|
|
Provider: structs.ServiceProviderConsul,
|
2019-02-13 22:51:05 +00:00
|
|
|
Checks: []*structs.ServiceCheck{
|
|
|
|
{
|
|
|
|
Name: "driver-check",
|
|
|
|
Type: "tcp",
|
|
|
|
PortLabel: "1234",
|
|
|
|
AddressMode: "driver",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Name: "driver-service",
|
|
|
|
PortLabel: "5678",
|
|
|
|
AddressMode: "driver",
|
2022-03-21 09:29:57 +00:00
|
|
|
Provider: structs.ServiceProviderConsul,
|
2019-02-13 22:51:05 +00:00
|
|
|
Checks: []*structs.ServiceCheck{
|
|
|
|
{
|
|
|
|
Name: "host-check",
|
|
|
|
Type: "tcp",
|
|
|
|
PortLabel: "http",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Name: "driver-label-check",
|
|
|
|
Type: "tcp",
|
|
|
|
PortLabel: "http",
|
|
|
|
AddressMode: "driver",
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Use a mock agent to test for services
|
2021-06-07 15:54:33 +00:00
|
|
|
consulAgent := agentconsul.NewMockAgent(agentconsul.Features{
|
|
|
|
Enterprise: false,
|
|
|
|
Namespaces: false,
|
|
|
|
})
|
|
|
|
namespacesClient := agentconsul.NewNamespacesClient(agentconsul.NewMockNamespaces(nil), consulAgent)
|
2021-03-16 18:22:21 +00:00
|
|
|
consulClient := agentconsul.NewServiceClient(consulAgent, namespacesClient, conf.Logger, true)
|
2019-02-13 22:51:05 +00:00
|
|
|
defer consulClient.Shutdown()
|
|
|
|
go consulClient.Run()
|
|
|
|
|
|
|
|
conf.Consul = consulClient
|
2022-03-21 09:29:57 +00:00
|
|
|
conf.ServiceRegWrapper = wrapper.NewHandlerWrapper(conf.Logger, consulClient, nil)
|
2019-02-13 22:51:05 +00:00
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// Wait for the task to start
|
|
|
|
testWaitForTaskToStart(t, tr)
|
|
|
|
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
2021-03-16 18:22:21 +00:00
|
|
|
services, _ := consulAgent.ServicesWithFilterOpts("", nil)
|
2019-02-13 22:51:05 +00:00
|
|
|
if n := len(services); n != 2 {
|
|
|
|
return false, fmt.Errorf("expected 2 services, but found %d", n)
|
|
|
|
}
|
|
|
|
for _, s := range services {
|
|
|
|
switch s.Service {
|
|
|
|
case "host-service":
|
|
|
|
if expected := "192.168.0.100"; s.Address != expected {
|
|
|
|
return false, fmt.Errorf("expected host-service to have IP=%s but found %s",
|
|
|
|
expected, s.Address)
|
|
|
|
}
|
|
|
|
case "driver-service":
|
|
|
|
if expected := "10.1.2.3"; s.Address != expected {
|
|
|
|
return false, fmt.Errorf("expected driver-service to have IP=%s but found %s",
|
|
|
|
expected, s.Address)
|
|
|
|
}
|
|
|
|
if expected := 5678; s.Port != expected {
|
|
|
|
return false, fmt.Errorf("expected driver-service to have port=%d but found %d",
|
|
|
|
expected, s.Port)
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
return false, fmt.Errorf("unexpected service: %q", s.Service)
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
checks := consulAgent.CheckRegs()
|
|
|
|
if n := len(checks); n != 3 {
|
|
|
|
return false, fmt.Errorf("expected 3 checks, but found %d", n)
|
|
|
|
}
|
|
|
|
for _, check := range checks {
|
|
|
|
switch check.Name {
|
|
|
|
case "driver-check":
|
|
|
|
if expected := "10.1.2.3:1234"; check.TCP != expected {
|
|
|
|
return false, fmt.Errorf("expected driver-check to have address %q but found %q", expected, check.TCP)
|
|
|
|
}
|
|
|
|
case "driver-label-check":
|
|
|
|
if expected := "10.1.2.3:80"; check.TCP != expected {
|
|
|
|
return false, fmt.Errorf("expected driver-label-check to have address %q but found %q", expected, check.TCP)
|
|
|
|
}
|
|
|
|
case "host-check":
|
|
|
|
if expected := "192.168.0.100:"; !strings.HasPrefix(check.TCP, expected) {
|
|
|
|
return false, fmt.Errorf("expected host-check to have address start with %q but found %q", expected, check.TCP)
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
return false, fmt.Errorf("unexpected check: %q", check.Name)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true, nil
|
|
|
|
}, func(err error) {
|
2021-03-16 18:22:21 +00:00
|
|
|
services, _ := consulAgent.ServicesWithFilterOpts("", nil)
|
2019-02-13 22:51:05 +00:00
|
|
|
for _, s := range services {
|
|
|
|
t.Logf(pretty.Sprint("Service: ", s))
|
|
|
|
}
|
|
|
|
for _, c := range consulAgent.CheckRegs() {
|
|
|
|
t.Logf(pretty.Sprint("Check: ", c))
|
|
|
|
}
|
|
|
|
require.NoError(t, err)
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2019-02-13 23:19:33 +00:00
|
|
|
// TestTaskRunner_RestartSignalTask_NotRunning asserts resilience to failures
|
|
|
|
// when a restart or signal is triggered and the task is not running.
|
|
|
|
func TestTaskRunner_RestartSignalTask_NotRunning(t *testing.T) {
|
	ci.Parallel(t)

alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Driver = "mock_driver"
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"run_for": "0s",
|
|
|
|
}
|
|
|
|
|
|
|
|
// Use vault to block the start
|
|
|
|
task.Vault = &structs.Vault{Policies: []string{"default"}}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
// Control when we get a Vault token
|
|
|
|
waitCh := make(chan struct{}, 1)
|
|
|
|
defer close(waitCh)
|
|
|
|
handler := func(*structs.Allocation, []string) (map[string]string, error) {
|
|
|
|
<-waitCh
|
|
|
|
return map[string]string{task.Name: "1234"}, nil
|
|
|
|
}
|
|
|
|
vaultClient := conf.Vault.(*vaultclient.MockVaultClient)
|
|
|
|
vaultClient.DeriveTokenFn = handler
|
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
require.Fail(t, "unexpected exit")
|
|
|
|
case <-time.After(1 * time.Second):
|
|
|
|
}
|
|
|
|
|
|
|
|
// Send a signal and restart
|
|
|
|
err = tr.Signal(structs.NewTaskEvent("don't panic"), "QUIT")
|
|
|
|
require.EqualError(t, err, ErrTaskNotRunning.Error())
|
|
|
|
|
|
|
|
// Send a restart
|
|
|
|
err = tr.Restart(context.Background(), structs.NewTaskEvent("don't panic"), false)
|
|
|
|
require.EqualError(t, err, ErrTaskNotRunning.Error())
|
|
|
|
|
|
|
|
// Unblock and let it finish
|
|
|
|
waitCh <- struct{}{}
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(10 * time.Second):
|
|
|
|
require.Fail(t, "timed out waiting for task to complete")
|
|
|
|
}
|
|
|
|
|
2020-01-07 17:58:29 +00:00
|
|
|
// Assert the task ran and never restarted
|
2019-02-13 23:19:33 +00:00
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.False(t, state.Failed)
|
|
|
|
require.Len(t, state.Events, 4, pretty.Sprint(state.Events))
|
|
|
|
require.Equal(t, structs.TaskReceived, state.Events[0].Type)
|
|
|
|
require.Equal(t, structs.TaskSetup, state.Events[1].Type)
|
|
|
|
require.Equal(t, structs.TaskStarted, state.Events[2].Type)
|
|
|
|
require.Equal(t, structs.TaskTerminated, state.Events[3].Type)
|
|
|
|
}
|
|
|
|
|
2019-02-13 23:34:17 +00:00
|
|
|
// TestTaskRunner_Run_RecoverableStartError asserts tasks are restarted if they
|
|
|
|
// return a recoverable error from StartTask.
|
|
|
|
func TestTaskRunner_Run_RecoverableStartError(t *testing.T) {
|
	ci.Parallel(t)

alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Config = map[string]interface{}{
|
|
|
|
"start_error": "driver failure",
|
|
|
|
"start_error_recoverable": true,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Make the restart policy retry once
|
2020-03-25 01:52:39 +00:00
|
|
|
rp := &structs.RestartPolicy{
|
2019-02-13 23:34:17 +00:00
|
|
|
Attempts: 1,
|
|
|
|
Interval: 10 * time.Minute,
|
|
|
|
Delay: 0,
|
|
|
|
Mode: structs.RestartPolicyModeFail,
|
|
|
|
}
|
2020-03-25 01:52:39 +00:00
|
|
|
alloc.Job.TaskGroups[0].RestartPolicy = rp
|
|
|
|
alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy = rp
|
2019-02-13 23:34:17 +00:00
|
|
|
|
|
|
|
tr, _, cleanup := runTestTaskRunner(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second):
|
|
|
|
require.Fail(t, "timed out waiting for task to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.True(t, state.Failed)
|
|
|
|
require.Len(t, state.Events, 6, pretty.Sprint(state.Events))
|
|
|
|
require.Equal(t, structs.TaskReceived, state.Events[0].Type)
|
|
|
|
require.Equal(t, structs.TaskSetup, state.Events[1].Type)
|
|
|
|
require.Equal(t, structs.TaskDriverFailure, state.Events[2].Type)
|
|
|
|
require.Equal(t, structs.TaskRestarting, state.Events[3].Type)
|
|
|
|
require.Equal(t, structs.TaskDriverFailure, state.Events[4].Type)
|
|
|
|
require.Equal(t, structs.TaskNotRestarting, state.Events[5].Type)
|
|
|
|
}
|
|
|
|
|
2019-02-22 03:02:50 +00:00
|
|
|
// TestTaskRunner_Template_Artifact asserts that tasks can use artifacts as templates.
|
|
|
|
func TestTaskRunner_Template_Artifact(t *testing.T) {
|
	ci.Parallel(t)

ts := httptest.NewServer(http.FileServer(http.Dir(".")))
|
|
|
|
defer ts.Close()
|
|
|
|
|
|
|
|
alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
f1 := "task_runner.go"
|
|
|
|
f2 := "test"
|
|
|
|
task.Artifacts = []*structs.TaskArtifact{
|
|
|
|
{GetterSource: fmt.Sprintf("%s/%s", ts.URL, f1)},
|
|
|
|
}
|
|
|
|
task.Templates = []*structs.Template{
|
|
|
|
{
|
|
|
|
SourcePath: f1,
|
|
|
|
DestPath: "local/test",
|
|
|
|
ChangeMode: structs.TemplateChangeModeNoop,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// Wait for task to run and exit
|
|
|
|
select {
|
|
|
|
case <-tr.WaitCh():
|
|
|
|
case <-time.After(15 * time.Second * time.Duration(testutil.TestMultiplier())):
|
|
|
|
require.Fail(t, "timed out waiting for task runner to exit")
|
|
|
|
}
|
|
|
|
|
|
|
|
state := tr.TaskState()
|
|
|
|
require.Equal(t, structs.TaskStateDead, state.State)
|
|
|
|
require.True(t, state.Successful())
|
|
|
|
require.False(t, state.Failed)
|
|
|
|
|
|
|
|
artifactsDownloaded := false
|
|
|
|
for _, e := range state.Events {
|
|
|
|
if e.Type == structs.TaskDownloadingArtifacts {
|
|
|
|
artifactsDownloaded = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert.True(t, artifactsDownloaded, "expected artifacts downloaded events")
|
|
|
|
|
|
|
|
// Check that both files exist.
|
|
|
|
_, err = os.Stat(filepath.Join(conf.TaskDir.Dir, f1))
|
|
|
|
require.NoErrorf(t, err, "%v not downloaded", f1)
|
|
|
|
|
|
|
|
_, err = os.Stat(filepath.Join(conf.TaskDir.LocalDir, f2))
|
|
|
|
require.NoErrorf(t, err, "%v not rendered", f2)
|
|
|
|
}
|
|
|
|
|
2019-10-08 18:34:09 +00:00
|
|
|
// TestTaskRunner_Template_BlockingPreStart asserts that a template
|
|
|
|
// that fails to render in PreStart can gracefully be shutdown by
|
|
|
|
// either killCtx or shutdownCtx
|
|
|
|
func TestTaskRunner_Template_BlockingPreStart(t *testing.T) {
|
	ci.Parallel(t)

alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Templates = []*structs.Template{
|
|
|
|
{
|
|
|
|
EmbeddedTmpl: `{{ with secret "foo/secret" }}{{ .Data.certificate }}{{ end }}`,
|
|
|
|
DestPath: "local/test",
|
|
|
|
ChangeMode: structs.TemplateChangeModeNoop,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
task.Vault = &structs.Vault{Policies: []string{"default"}}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
go tr.Run()
|
|
|
|
defer tr.Shutdown()
|
|
|
|
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
|
|
|
ts := tr.TaskState()
|
|
|
|
|
|
|
|
if len(ts.Events) == 0 {
|
|
|
|
return false, fmt.Errorf("no events yet")
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, e := range ts.Events {
|
|
|
|
if e.Type == "Template" && strings.Contains(e.DisplayMessage, "vault.read(foo/secret)") {
|
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false, fmt.Errorf("no missing vault secret template event yet: %#v", ts.Events)
|
|
|
|
|
|
|
|
}, func(err error) {
|
|
|
|
require.NoError(t, err)
|
|
|
|
})
|
|
|
|
|
|
|
|
shutdown := func() <-chan bool {
|
|
|
|
finished := make(chan bool)
|
|
|
|
go func() {
|
|
|
|
tr.Shutdown()
|
|
|
|
finished <- true
|
|
|
|
}()
|
|
|
|
|
|
|
|
return finished
|
|
|
|
}
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-shutdown():
|
|
|
|
// it shut down like it should have
|
|
|
|
case <-time.After(10 * time.Second):
|
|
|
|
require.Fail(t, "timeout shutting down task")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-02-22 19:35:21 +00:00
|
|
|
// TestTaskRunner_Template_NewVaultToken asserts that a new vault token is
|
|
|
|
// created when rendering template and that it is revoked on alloc completion
|
2019-02-22 03:23:50 +00:00
|
|
|
func TestTaskRunner_Template_NewVaultToken(t *testing.T) {
|
	ci.Parallel(t)

alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Templates = []*structs.Template{
|
|
|
|
{
|
|
|
|
EmbeddedTmpl: `{{key "foo"}}`,
|
|
|
|
DestPath: "local/test",
|
|
|
|
ChangeMode: structs.TemplateChangeModeNoop,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
task.Vault = &structs.Vault{Policies: []string{"default"}}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
// Wait for a Vault token
|
|
|
|
var token string
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
2019-02-22 19:35:21 +00:00
|
|
|
token = tr.getVaultToken()
|
2019-02-22 03:23:50 +00:00
|
|
|
|
|
|
|
if token == "" {
|
|
|
|
return false, fmt.Errorf("No Vault token")
|
|
|
|
}
|
|
|
|
|
|
|
|
return true, nil
|
|
|
|
}, func(err error) {
|
|
|
|
require.NoError(t, err)
|
|
|
|
})
|
|
|
|
|
|
|
|
vault := conf.Vault.(*vaultclient.MockVaultClient)
|
|
|
|
renewalCh, ok := vault.RenewTokens()[token]
|
|
|
|
require.True(t, ok, "no renewal channel for token")
|
|
|
|
|
|
|
|
renewalCh <- fmt.Errorf("Test killing")
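	// Simulate the token renewal failing; the task runner is expected to
	// derive a fresh Vault token, which is asserted below.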
|
|
|
|
close(renewalCh)
|
|
|
|
|
|
|
|
var token2 string
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
2019-02-22 19:35:21 +00:00
|
|
|
token2 = tr.getVaultToken()
|
2019-02-22 03:23:50 +00:00
|
|
|
|
|
|
|
if token2 == "" {
|
|
|
|
return false, fmt.Errorf("No Vault token")
|
|
|
|
}
|
|
|
|
|
|
|
|
if token2 == token {
|
|
|
|
return false, fmt.Errorf("token wasn't recreated")
|
|
|
|
}
|
|
|
|
|
|
|
|
return true, nil
|
|
|
|
}, func(err error) {
|
|
|
|
require.NoError(t, err)
|
|
|
|
})
|
|
|
|
|
|
|
|
// Check the token was revoked
|
|
|
|
testutil.WaitForResult(func() (bool, error) {
|
|
|
|
if len(vault.StoppedTokens()) != 1 {
|
|
|
|
return false, fmt.Errorf("Expected a stopped token: %v", vault.StoppedTokens())
|
|
|
|
}
|
|
|
|
|
|
|
|
if a := vault.StoppedTokens()[0]; a != token {
|
|
|
|
return false, fmt.Errorf("got stopped token %q; want %q", a, token)
|
|
|
|
}
|
|
|
|
|
|
|
|
return true, nil
|
|
|
|
}, func(err error) {
|
|
|
|
require.NoError(t, err)
|
|
|
|
})
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2019-02-22 19:35:21 +00:00
|
|
|
// TestTaskRunner_VaultManager_Restart asserts that the alloc is restarted when the alloc
|
|
|
|
// derived vault token expires, when task is configured with Restart change mode
|
2019-02-22 14:20:57 +00:00
|
|
|
func TestTaskRunner_VaultManager_Restart(t *testing.T) {
|
	ci.Parallel(t)

alloc := mock.BatchAlloc()
|
|
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
|
|
task.Config = map[string]interface{}{
|
2019-02-22 19:35:21 +00:00
|
|
|
"run_for": "10s",
|
2019-02-22 14:20:57 +00:00
|
|
|
}
|
|
|
|
task.Vault = &structs.Vault{
|
|
|
|
Policies: []string{"default"},
|
|
|
|
ChangeMode: structs.VaultChangeModeRestart,
|
|
|
|
}
|
|
|
|
|
|
|
|
conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
|
|
|
|
defer cleanup()
|
|
|
|
|
|
|
|
tr, err := NewTaskRunner(conf)
|
|
|
|
require.NoError(t, err)
|
|
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
|
|
go tr.Run()
|
|
|
|
|
|
|
|
testWaitForTaskToStart(t, tr)
|
|
|
|
|
|
|
|
tr.vaultTokenLock.Lock()
|
|
|
|
token := tr.vaultToken
|
|
|
|
tr.vaultTokenLock.Unlock()
|
|
|
|
|
|
|
|
require.NotEmpty(t, token)
|
|
|
|
|
|
|
|
vault := conf.Vault.(*vaultclient.MockVaultClient)
|
|
|
|
renewalCh, ok := vault.RenewTokens()[token]
|
|
|
|
require.True(t, ok, "no renewal channel for token")
|
|
|
|
|
|
|
|
renewalCh <- fmt.Errorf("Test killing")
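	// With ChangeMode "restart", a failed renewal should restart the task
	// rather than only re-rendering templates.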
|
|
|
|
close(renewalCh)
|
|
|
|
|
|
|
|
	testutil.WaitForResult(func() (bool, error) {
		state := tr.TaskState()

		if len(state.Events) == 0 {
			return false, fmt.Errorf("no events yet")
		}

		foundRestartSignal, foundRestarting := false, false
		for _, e := range state.Events {
			switch e.Type {
			case structs.TaskRestartSignal:
				foundRestartSignal = true
			case structs.TaskRestarting:
				foundRestarting = true
			}
		}

		if !foundRestartSignal {
			return false, fmt.Errorf("no restart signal event yet: %#v", state.Events)
		}

		if !foundRestarting {
			return false, fmt.Errorf("no restarting event yet: %#v", state.Events)
		}

		lastEvent := state.Events[len(state.Events)-1]
		if lastEvent.Type != structs.TaskStarted {
			return false, fmt.Errorf("expected last event to be task started but was %#v", lastEvent)
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestTaskRunner_VaultManager_Signal asserts that the task is signalled when
// its allocation-derived Vault token expires and the task is configured with
// the signal change mode.
func TestTaskRunner_VaultManager_Signal(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.BatchAlloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}
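
	// With the signal change mode, the task is expected to receive the
	// configured signal (SIGUSR1 here) instead of being restarted when the
	// token can no longer be renewed.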
	task.Vault = &structs.Vault{
		Policies:     []string{"default"},
		ChangeMode:   structs.VaultChangeModeSignal,
		ChangeSignal: "SIGUSR1",
	}

	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	go tr.Run()

	testWaitForTaskToStart(t, tr)

	tr.vaultTokenLock.Lock()
	token := tr.vaultToken
	tr.vaultTokenLock.Unlock()

	require.NotEmpty(t, token)

	vault := conf.Vault.(*vaultclient.MockVaultClient)
	renewalCh, ok := vault.RenewTokens()[token]
	require.True(t, ok, "no renewal channel for token")

	renewalCh <- fmt.Errorf("Test killing")
	close(renewalCh)

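	// Wait for the signaling event to appear in the task's event stream.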
	testutil.WaitForResult(func() (bool, error) {
		state := tr.TaskState()

		if len(state.Events) == 0 {
			return false, fmt.Errorf("no events yet")
		}

		foundSignaling := false
		for _, e := range state.Events {
			if e.Type == structs.TaskSignaling {
				foundSignaling = true
			}
		}

		if !foundSignaling {
			return false, fmt.Errorf("no signaling event yet: %#v", state.Events)
		}

		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestTaskRunner_UnregisterConsul_Retries asserts a task is unregistered from
// Consul when waiting to be retried.
func TestTaskRunner_UnregisterConsul_Retries(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()

	// Make the restart policy retry once with effectively no delay.
	rp := &structs.RestartPolicy{
		Attempts: 1,
		Interval: 10 * time.Minute,
		Delay:    time.Nanosecond,
		Mode:     structs.RestartPolicyModeFail,
	}
	alloc.Job.TaskGroups[0].RestartPolicy = rp
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.RestartPolicy = rp
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "1",
		"run_for":   "1ns",
	}

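	// The task exits immediately with a non-zero code, so it is expected to
	// run, fail, be retried once, and fail again before ending up dead.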
	conf, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(conf)
	require.NoError(t, err)
	defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
	tr.Run()

	state := tr.TaskState()
	require.Equal(t, structs.TaskStateDead, state.State)

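	// Each attempt registers the task's services on start and deregisters them
	// on exit, so two runs should leave exactly four Consul operations:
	// add, remove, add, remove.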
	consul := conf.Consul.(*regMock.ServiceRegistrationHandler)
	consulOps := consul.GetOps()
	require.Len(t, consulOps, 4)

	// Initial add
	require.Equal(t, "add", consulOps[0].Op)

	// Removing entries on first exit
	require.Equal(t, "remove", consulOps[1].Op)

	// Second add on retry
	require.Equal(t, "add", consulOps[2].Op)

	// Removing entries on retry
	require.Equal(t, "remove", consulOps[3].Op)
}

// testWaitForTaskToStart waits for the task to be running or fails the test.
func testWaitForTaskToStart(t *testing.T, tr *TaskRunner) {
	testutil.WaitForResult(func() (bool, error) {
		ts := tr.TaskState()
		return ts.State == structs.TaskStateRunning, fmt.Errorf("%v", ts.State)
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestTaskRunner_BaseLabels tests that the base labels for the task metrics
// are set appropriately.
func TestTaskRunner_BaseLabels(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	alloc := mock.BatchAlloc()
	alloc.Namespace = "not-default"
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "raw_exec"
	task.Config = map[string]interface{}{
		"command": "whoami",
	}

	config, cleanup := testTaskRunnerConfig(t, alloc, task.Name)
	defer cleanup()

	tr, err := NewTaskRunner(config)
	require.NoError(err)

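	// tr.baseLabels is assumed to hold the go-metrics labels attached to the
	// task's metrics; flatten them into a map for easier assertions.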
	labels := map[string]string{}
	for _, e := range tr.baseLabels {
		labels[e.Name] = e.Value
	}
	require.Equal(alloc.Job.Name, labels["job"])
	require.Equal(alloc.TaskGroup, labels["task_group"])
	require.Equal(task.Name, labels["task"])
	require.Equal(alloc.ID, labels["alloc_id"])
	require.Equal(alloc.Namespace, labels["namespace"])
}