// Copyright (c) HashiCorp, Inc. // SPDX-License-Identifier: MPL-2.0 package command import ( "context" "fmt" "net/http" "net/http/httptest" "net/http/httputil" neturl "net/url" "regexp" "sort" "strings" "sync/atomic" "testing" "time" "github.com/google/go-cmp/cmp/cmpopts" "github.com/hashicorp/go-set" "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/command/agent" "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/cli" "github.com/shoenig/test/must" "github.com/shoenig/test/wait" ) func TestJobRestartCommand_Implements(t *testing.T) { ci.Parallel(t) var _ cli.Command = &JobRestartCommand{} } func TestJobRestartCommand_parseAndValidate(t *testing.T) { ci.Parallel(t) testCases := []struct { name string args []string expectedErr string expectedCmd *JobRestartCommand }{ { name: "missing job", args: []string{}, expectedErr: "This command takes one argument", }, { name: "too many args", args: []string{"one", "two", "three"}, expectedErr: "This command takes one argument", }, { name: "tasks and groups", args: []string{ "-task", "my-task-1", "-task", "my-task-2", "-group", "my-group-1", "-group", "my-group-2", "my-job", }, expectedCmd: &JobRestartCommand{ jobID: "my-job", groups: set.From([]string{"my-group-1", "my-group-2"}), tasks: set.From([]string{"my-task-1", "my-task-2"}), batchSize: 1, }, }, { name: "all tasks", args: []string{"-all-tasks", "my-job"}, expectedCmd: &JobRestartCommand{ jobID: "my-job", allTasks: true, batchSize: 1, }, }, { name: "all tasks conflicts with task", args: []string{"-all-tasks", "-task", "my-task", "-yes", "my-job"}, expectedErr: "The -all-tasks option cannot be used with -task", }, { name: "batch size as number", args: []string{"-batch-size", "10", "my-job"}, expectedCmd: &JobRestartCommand{ jobID: "my-job", batchSize: 10, }, }, { name: "batch size as percentage", args: []string{"-batch-size", "10%", "my-job"}, expectedCmd: &JobRestartCommand{ jobID: "my-job", batchSize: 10, batchSizePercent: true, }, }, { name: "batch size not valid", args: []string{"-batch-size", "not-valid", "my-job"}, expectedErr: "Invalid -batch-size value", }, { name: "batch size decimal not valid", args: []string{"-batch-size", "1.5", "my-job"}, expectedErr: "Invalid -batch-size value", }, { name: "batch size zero", args: []string{"-batch-size", "0", "my-job"}, expectedErr: "Invalid -batch-size value", }, { name: "batch size decimal percent not valid", args: []string{"-batch-size", "1.5%", "my-job"}, expectedErr: "Invalid -batch-size value", }, { name: "batch size zero percentage", args: []string{"-batch-size", "0%", "my-job"}, expectedErr: "Invalid -batch-size value", }, { name: "batch size with multiple numbers and percentages", args: []string{"-batch-size", "15%10%", "my-job"}, expectedErr: "Invalid -batch-size value", }, { name: "batch wait ask", args: []string{"-batch-wait", "ask", "my-job"}, expectedErr: "terminal is not interactive", // Can't test non-interactive. 
}, { name: "batch wait duration", args: []string{"-batch-wait", "10s", "my-job"}, expectedCmd: &JobRestartCommand{ jobID: "my-job", batchSize: 1, batchWait: 10 * time.Second, }, }, { name: "batch wait invalid", args: []string{"-batch-wait", "10", "my-job"}, expectedErr: "Invalid -batch-wait value", }, { name: "on error fail", args: []string{"-on-error", "fail", "my-job"}, expectedCmd: &JobRestartCommand{ jobID: "my-job", batchSize: 1, onError: jobRestartOnErrorFail, }, }, { name: "on error invalid", args: []string{"-on-error", "invalid", "my-job"}, expectedErr: "Invalid -on-error value", }, { name: "no shutdown delay", args: []string{"-no-shutdown-delay", "my-job"}, expectedCmd: &JobRestartCommand{ jobID: "my-job", batchSize: 1, noShutdownDelay: true, }, }, { name: "reschedule", args: []string{"-reschedule", "my-job"}, expectedCmd: &JobRestartCommand{ jobID: "my-job", batchSize: 1, reschedule: true, }, }, { name: "reschedule conflicts with task", args: []string{"-reschedule", "-task", "my-task", "-yes", "my-job"}, expectedErr: "The -reschedule option cannot be used with -task", }, { name: "verbose", args: []string{"-verbose", "my-job"}, expectedCmd: &JobRestartCommand{ jobID: "my-job", batchSize: 1, verbose: true, length: fullId, }, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { ui := &cli.ConcurrentUi{Ui: cli.NewMockUi()} meta := Meta{Ui: ui} // Set some default values if not defined in test case. if tc.expectedCmd != nil { tc.expectedCmd.Meta = meta if tc.expectedCmd.length == 0 { tc.expectedCmd.length = shortId } if tc.expectedCmd.groups == nil { tc.expectedCmd.groups = set.New[string](0) } if tc.expectedCmd.tasks == nil { tc.expectedCmd.tasks = set.New[string](0) } if tc.expectedCmd.onError == "" { tc.expectedCmd.onError = jobRestartOnErrorAsk tc.expectedCmd.autoYes = true tc.args = append([]string{"-yes"}, tc.args...) } } cmd := &JobRestartCommand{Meta: meta} code, err := cmd.parseAndValidate(tc.args) if tc.expectedErr != "" { must.NonZero(t, code) must.ErrorContains(t, err, tc.expectedErr) } else { must.NoError(t, err) must.Zero(t, code) must.Eq(t, tc.expectedCmd, cmd, must.Cmp(cmpopts.IgnoreFields(JobRestartCommand{}, "Meta", "Meta.Ui"))) } }) } } func TestJobRestartCommand_Run(t *testing.T) { ci.Parallel(t) // Create a job with multiple tasks, groups, and allocations. prestartTask := api.NewTask("prestart", "mock_driver"). SetConfig("run_for", "100ms"). SetConfig("exit_code", 0). SetLifecycle(&api.TaskLifecycle{ Hook: api.TaskLifecycleHookPrestart, Sidecar: false, }) sidecarTask := api.NewTask("sidecar", "mock_driver"). SetConfig("run_for", "1m"). SetConfig("exit_code", 0). SetLifecycle(&api.TaskLifecycle{ Hook: api.TaskLifecycleHookPoststart, Sidecar: true, }) mainTask := api.NewTask("main", "mock_driver"). SetConfig("run_for", "1m"). SetConfig("exit_code", 0) jobID := "test_job_restart_cmd" job := api.NewServiceJob(jobID, jobID, "global", 1). AddDatacenter("dc1"). AddTaskGroup( api.NewTaskGroup("single_task", 3). AddTask(mainTask), ). AddTaskGroup( api.NewTaskGroup("multiple_tasks", 2). AddTask(prestartTask). AddTask(sidecarTask). AddTask(mainTask), ) testCases := []struct { name string args []string // Job arg is added automatically. 
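		// expectedCode is the exit code the restart command is expected to
		// return for this test case (0 unless set otherwise).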
expectedCode int validateFn func(*testing.T, *api.Client, []*api.AllocationListStub, string, string) }{ { name: "restart only running tasks in all groups by default", args: []string{"-batch-size", "100%"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": false, "sidecar": true, "main": true, }, }) // Check that allocations restarted in a single batch. batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main") must.Len(t, 5, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "restart specific task in all groups", args: []string{"-batch-size", "100%", "-task", "main"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": false, "sidecar": false, "main": true, }, }) // Check that allocations restarted in a single batch. batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main") must.Len(t, 5, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "restart multiple tasks in all groups", args: []string{"-batch-size", "100%", "-task", "main", "-task", "sidecar"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": false, "sidecar": true, "main": true, }, }) // Check that allocations restarted in a single batch. batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main") must.Len(t, 5, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "restart all tasks in all groups", args: []string{"-batch-size", "100%", "-all-tasks"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": true, "sidecar": true, "main": true, }, }) // Check that allocations restarted in a single batch. batches := getRestartBatches(restarted, []string{"single_task", "multiple_tasks"}, "main") must.Len(t, 5, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "restart running tasks in specific group", args: []string{"-batch-size", "100%", "-group", "single_task"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": false, "sidecar": false, "main": false, }, }) // Check that allocations restarted in a single batch. 
batches := getRestartBatches(restarted, []string{"single_task"}, "main") must.Len(t, 3, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "restart specific task that is not running", args: []string{"-batch-size", "100%", "-task", "prestart"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": false, }, "multiple_tasks": { "prestart": false, "sidecar": false, "main": false, }, }) // Check that allocations restarted in a single batch. batches := getRestartBatches(restarted, []string{"single_task"}, "main") must.Len(t, 3, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") // Check that we have an error message. must.StrContains(t, stderr, "Task not running") }, expectedCode: 1, }, { name: "restart specific task in specific group", args: []string{"-batch-size", "100%", "-task", "main", "-group", "single_task"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": false, "sidecar": false, "main": false, }, }) // Check that allocations restarted in a single batch. batches := getRestartBatches(restarted, []string{"single_task"}, "main") must.Len(t, 3, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "restart multiple tasks in specific group", args: []string{"-batch-size", "100%", "-task", "main", "-task", "sidecar", "-group", "multiple_tasks"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": false, }, "multiple_tasks": { "prestart": false, "sidecar": true, "main": true, }, }) // Check that allocations restarted in a single batch. batches := getRestartBatches(restarted, []string{"multiple_tasks"}, "main") must.Len(t, 2, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "restart all tasks in specific group", args: []string{"-batch-size", "100%", "-all-tasks", "-group", "multiple_tasks"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": false, }, "multiple_tasks": { "prestart": true, "sidecar": true, "main": true, }, }) // Check that allocations restarted in a single batch. 
batches := getRestartBatches(restarted, []string{"multiple_tasks"}, "main") must.Len(t, 2, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "restart in batches", args: []string{"-batch-size", "3", "-batch-wait", "3s", "-task", "main"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": false, "sidecar": false, "main": true, }, }) // Check that allocations were properly batched. batches := getRestartBatches(restarted, []string{"multiple_tasks", "single_task"}, "main") must.Len(t, 3, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations") must.Len(t, 2, batches[1]) must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations") // Check that we only waited between batches. waitMsgCount := strings.Count(stdout, "Waiting 3s before restarting the next batch") must.Eq(t, 1, waitMsgCount) // Check that batches waited the expected time. batch1Restart := batches[0][0].TaskStates["main"].LastRestart batch2Restart := batches[1][0].TaskStates["main"].LastRestart diff := batch2Restart.Sub(batch1Restart) must.Between(t, 3*time.Second, diff, 4*time.Second) }, }, { name: "restart in percent batch", args: []string{"-batch-size", "50%", "-batch-wait", "3s", "-task", "main"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": false, "sidecar": false, "main": true, }, }) // Check that allocations were properly batched. batches := getRestartBatches(restarted, []string{"multiple_tasks", "single_task"}, "main") must.Len(t, 3, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations") must.Len(t, 2, batches[1]) must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations") // Check that we only waited between batches. waitMsgCount := strings.Count(stdout, "Waiting 3s before restarting the next batch") must.Eq(t, 1, waitMsgCount) // Check that batches waited the expected time. batch1Restart := batches[0][0].TaskStates["main"].LastRestart batch2Restart := batches[1][0].TaskStates["main"].LastRestart diff := batch2Restart.Sub(batch1Restart) must.Between(t, 3*time.Second, diff, 4*time.Second) }, }, { name: "restart in batch ask with yes", args: []string{"-batch-size", "100%", "-batch-wait", "ask", "-yes", "-group", "single_task"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { restarted := waitTasksRestarted(t, client, allocs, map[string]map[string]bool{ "single_task": { "main": true, }, "multiple_tasks": { "prestart": false, "sidecar": false, "main": false, }, }) // Check that allocations restarted in a single batch. batches := getRestartBatches(restarted, []string{"single_task"}, "main") must.Len(t, 3, batches[0]) must.StrContains(t, stdout, "Restarting 1st batch") must.StrNotContains(t, stdout, "restarting the next batch") }, }, { name: "reschedule in batches", args: []string{"-reschedule", "-batch-size", "3"}, validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) { // Expect all allocations were rescheduled. 
				reschedules := map[string]bool{}
				for _, alloc := range allocs {
					reschedules[alloc.ID] = true
				}
				waitAllocsRescheduled(t, client, reschedules)

				// Check that allocations were properly batched.
				must.StrContains(t, stdout, "Restarting 1st batch of 3 allocations")
				must.StrContains(t, stdout, "Restarting 2nd batch of 2 allocations")
				must.StrNotContains(t, stdout, "Waiting")
			},
		},
		{
			name: "reschedule specific group",
			args: []string{"-reschedule", "-batch-size", "100%", "-group", "single_task"},
			validateFn: func(t *testing.T, client *api.Client, allocs []*api.AllocationListStub, stdout string, stderr string) {
				// Expect that only allocs for the single_task group were
				// rescheduled.
				reschedules := map[string]bool{}
				for _, alloc := range allocs {
					if alloc.TaskGroup == "single_task" {
						reschedules[alloc.ID] = true
					}
				}
				waitAllocsRescheduled(t, client, reschedules)

				// Check that allocations restarted in a single batch.
				must.StrContains(t, stdout, "Restarting 1st batch")
				must.StrNotContains(t, stdout, "restarting the next batch")
			},
		},
	}

	for _, tc := range testCases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			// Run each test case in parallel because they are fairly slow.
			ci.Parallel(t)

			// Initialize UI and command.
			ui := cli.NewMockUi()
			cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}

			// Start client and server and wait for node to be ready.
			// Use a separate cluster for each test case so they can run in
			// parallel without affecting each other.
			srv, client, url := testServer(t, true, nil)
			defer srv.Shutdown()

			waitForNodes(t, client)

			// Register test job and wait for its allocs to be running.
			resp, _, err := client.Jobs().Register(job, nil)
			must.NoError(t, err)

			code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
			must.Zero(t, code)

			allocStubs, _, err := client.Jobs().Allocations(jobID, true, nil)
			must.NoError(t, err)
			for _, alloc := range allocStubs {
				waitForAllocRunning(t, client, alloc.ID)
			}

			// Fetch allocations before the restart so we know which ones are
			// supposed to be affected in case the test reschedules allocs.
			allocStubs, _, err = client.Jobs().Allocations(jobID, true, nil)
			must.NoError(t, err)

			// Prepend server URL and append job ID to the test case command.
			args := []string{"-address", url, "-yes"}
			args = append(args, tc.args...)
			args = append(args, jobID)

			// Run job restart command.
			code = cmd.Run(args)
			must.Eq(t, code, tc.expectedCode)

			// Run test case validation function.
			if tc.validateFn != nil {
				tc.validateFn(t, client, allocStubs, ui.OutputWriter.String(), ui.ErrorWriter.String())
			}
		})
	}
}

func TestJobRestartCommand_jobPrefixAndNamespace(t *testing.T) {
	ci.Parallel(t)

	ui := cli.NewMockUi()

	// Start client and server and wait for node to be ready.
	srv, client, url := testServer(t, true, nil)
	defer srv.Shutdown()
	waitForNodes(t, client)

	// Create non-default namespace.
	_, err := client.Namespaces().Register(&api.Namespace{Name: "prod"}, nil)
	must.NoError(t, err)

	// Register job with same name in both namespaces.
evalIDs := []string{} jobDefault := testJob("test_job_restart") resp, _, err := client.Jobs().Register(jobDefault, nil) must.NoError(t, err) evalIDs = append(evalIDs, resp.EvalID) jobProd := testJob("test_job_restart") jobProd.Namespace = pointer.Of("prod") resp, _, err = client.Jobs().Register(jobProd, nil) must.NoError(t, err) evalIDs = append(evalIDs, resp.EvalID) jobUniqueProd := testJob("test_job_restart_prod_ns") jobUniqueProd.Namespace = pointer.Of("prod") resp, _, err = client.Jobs().Register(jobUniqueProd, nil) must.NoError(t, err) evalIDs = append(evalIDs, resp.EvalID) // Wait for evals to be processed. for _, evalID := range evalIDs { code := waitForSuccess(ui, client, fullId, t, evalID) must.Eq(t, 0, code) } ui.OutputWriter.Reset() testCases := []struct { name string args []string expectedErr string }{ { name: "prefix match in default namespace", args: []string{"test_job"}, }, { name: "invalid job", args: []string{"not-valid"}, expectedErr: "No job(s) with prefix or ID", }, { name: "prefix matches multiple jobs", args: []string{"-namespace", "prod", "test_job"}, expectedErr: "matched multiple jobs", }, { name: "prefix matches multiple jobs across namespaces", args: []string{"-namespace", "*", "test_job"}, expectedErr: "matched multiple jobs", }, { name: "unique prefix match across namespaces", args: []string{"-namespace", "*", "test_job_restart_prod"}, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { defer func() { ui.OutputWriter.Reset() ui.ErrorWriter.Reset() }() cmd := &JobRestartCommand{ Meta: Meta{Ui: &cli.ConcurrentUi{Ui: ui}}, } args := append([]string{"-address", url, "-yes"}, tc.args...) code := cmd.Run(args) if tc.expectedErr != "" { must.NonZero(t, code) must.StrContains(t, ui.ErrorWriter.String(), tc.expectedErr) } else { must.Zero(t, code) } }) } } func TestJobRestartCommand_noAllocs(t *testing.T) { ci.Parallel(t) ui := cli.NewMockUi() cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} // Start client and server and wait for node to be ready. srv, client, url := testServer(t, true, nil) defer srv.Shutdown() waitForNodes(t, client) // Register test job with impossible constraint so it doesn't get allocs. jobID := "test_job_restart_no_allocs" job := testJob(jobID) job.Datacenters = []string{"invalid"} resp, _, err := client.Jobs().Register(job, nil) must.NoError(t, err) code := waitForSuccess(ui, client, fullId, t, resp.EvalID) must.Eq(t, 2, code) // Placement is expected to fail so exit code is not 0. ui.OutputWriter.Reset() // Run job restart command and expect it to exit without restarts. code = cmd.Run([]string{ "-address", url, "-yes", jobID, }) must.Zero(t, code) must.StrContains(t, ui.OutputWriter.String(), "No allocations to restart") } func TestJobRestartCommand_rescheduleFail(t *testing.T) { ci.Parallel(t) ui := cli.NewMockUi() cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} // Start client and server and wait for node to be ready. srv, client, url := testServer(t, true, nil) defer srv.Shutdown() waitForNodes(t, client) // Register test job with 3 allocs. jobID := "test_job_restart_reschedule_fail" job := testJob(jobID) job.TaskGroups[0].Count = pointer.Of(3) resp, _, err := client.Jobs().Register(job, nil) must.NoError(t, err) code := waitForSuccess(ui, client, fullId, t, resp.EvalID) must.Zero(t, code) ui.OutputWriter.Reset() // Wait for allocs to be running. 
allocs, _, err := client.Jobs().Allocations(jobID, true, nil) must.NoError(t, err) for _, alloc := range allocs { waitForAllocRunning(t, client, alloc.ID) } // Mark node as ineligible to prevent allocs from being replaced. nodeID := srv.Agent.Client().NodeID() client.Nodes().ToggleEligibility(nodeID, false, nil) // Run job restart command and expect it to fail. code = cmd.Run([]string{ "-address", url, "-batch-size", "2", "-reschedule", "-yes", jobID, }) must.One(t, code) must.StrContains(t, ui.ErrorWriter.String(), "No nodes were eligible for evaluation") } func TestJobRestartCommand_monitorReplacementAlloc(t *testing.T) { ci.Parallel(t) ui := cli.NewMockUi() cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} srv, client, _ := testServer(t, true, nil) defer srv.Shutdown() waitForNodes(t, client) // Register test job and update it twice so we end up with three // allocations, one replacing the next one. jobID := "test_job_restart_monitor_replacement" job := testJob(jobID) for i := 1; i <= 3; i++ { job.TaskGroups[0].Tasks[0].Config["run_for"] = fmt.Sprintf("%ds", i) resp, _, err := client.Jobs().Register(job, nil) must.NoError(t, err) code := waitForSuccess(ui, client, fullId, t, resp.EvalID) must.Zero(t, code) } ui.OutputWriter.Reset() // Prepare the command internals. We want to run a specific function and // target a specific allocation, so we can't run the full command. cmd.client = client cmd.verbose = true cmd.length = fullId // Fetch, sort, and monitor the oldest allocation. allocs, _, err := client.Jobs().Allocations(jobID, true, nil) must.NoError(t, err) sort.Slice(allocs, func(i, j int) bool { return allocs[i].CreateIndex < allocs[j].CreateIndex }) errCh := make(chan error) go cmd.monitorReplacementAlloc(context.Background(), AllocationListStubWithJob{ AllocationListStub: allocs[0], Job: job, }, errCh) // Make sure the command doesn't get stuck and that we traverse the // follow-up allocations properly. must.Wait(t, wait.InitialSuccess( wait.ErrorFunc(func() error { select { case err := <-errCh: return err default: return fmt.Errorf("waiting for response") } }), wait.Timeout(time.Duration(testutil.TestMultiplier()*3)*time.Second), )) must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q replaced by %q", allocs[0].ID, allocs[1].ID)) must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q replaced by %q", allocs[1].ID, allocs[2].ID)) must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("%q is %q", allocs[2].ID, api.AllocClientStatusRunning)) } func TestJobRestartCommand_activeDeployment(t *testing.T) { ci.Parallel(t) srv, client, url := testServer(t, true, nil) defer srv.Shutdown() waitForNodes(t, client) // Register test job and update it once to trigger a deployment. jobID := "test_job_restart_deployment" job := testJob(jobID) job.Type = pointer.Of(api.JobTypeService) job.Update = &api.UpdateStrategy{ Canary: pointer.Of(1), AutoPromote: pointer.Of(false), } _, _, err := client.Jobs().Register(job, nil) must.NoError(t, err) _, _, err = client.Jobs().Register(job, nil) must.NoError(t, err) // Wait for a deployment to be running. must.Wait(t, wait.InitialSuccess( wait.ErrorFunc(func() error { deployments, _, err := client.Jobs().Deployments(jobID, true, nil) if err != nil { return err } for _, d := range deployments { if d.Status == api.DeploymentStatusRunning { return nil } } return fmt.Errorf("no running deployments") }), wait.Timeout(time.Duration(testutil.TestMultiplier()*3)*time.Second), )) // Run job restart command and expect it to fail. 
ui := cli.NewMockUi() cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} code := cmd.Run([]string{ "-address", url, "-on-error", jobRestartOnErrorFail, "-verbose", jobID, }) must.One(t, code) must.RegexMatch(t, regexp.MustCompile(`Deployment .+ is "running"`), ui.ErrorWriter.String()) } func TestJobRestartCommand_ACL(t *testing.T) { ci.Parallel(t) // Start server with ACL enabled. srv, client, url := testServer(t, true, func(c *agent.Config) { c.ACL.Enabled = true }) defer srv.Shutdown() rootTokenOpts := &api.WriteOptions{ AuthToken: srv.RootToken.SecretID, } // Register test job. jobID := "test_job_restart_acl" job := testJob(jobID) _, _, err := client.Jobs().Register(job, rootTokenOpts) must.NoError(t, err) // Wait for allocs to be running. waitForJobAllocsStatus(t, client, jobID, api.AllocClientStatusRunning, srv.RootToken.SecretID) testCases := []struct { name string jobPrefix bool aclPolicy string expectedErr string }{ { name: "no token", aclPolicy: "", expectedErr: api.PermissionDeniedErrorContent, }, { name: "alloc-lifecycle not enough", aclPolicy: ` namespace "default" { capabilities = ["alloc-lifecycle"] } `, expectedErr: api.PermissionDeniedErrorContent, }, { name: "read-job not enough", aclPolicy: ` namespace "default" { capabilities = ["read-job"] } `, expectedErr: api.PermissionDeniedErrorContent, }, { name: "alloc-lifecycle and read-job allowed", aclPolicy: ` namespace "default" { capabilities = ["alloc-lifecycle", "read-job"] } `, }, { name: "job prefix requires list-jobs", aclPolicy: ` namespace "default" { capabilities = ["alloc-lifecycle", "read-job"] } `, jobPrefix: true, expectedErr: "job not found", }, { name: "job prefix works with list-jobs", aclPolicy: ` namespace "default" { capabilities = ["list-jobs", "alloc-lifecycle", "read-job"] } `, jobPrefix: true, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { ui := cli.NewMockUi() cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} args := []string{ "-address", url, "-yes", } if tc.aclPolicy != "" { // Create ACL token with test case policy. policy := &api.ACLPolicy{ Name: nonAlphaNum.ReplaceAllString(tc.name, "-"), Rules: tc.aclPolicy, } _, err := client.ACLPolicies().Upsert(policy, rootTokenOpts) must.NoError(t, err) token := &api.ACLToken{ Type: "client", Policies: []string{policy.Name}, } token, _, err = client.ACLTokens().Create(token, rootTokenOpts) must.NoError(t, err) // Set token in command args. args = append(args, "-token", token.SecretID) } // Add job ID or job ID prefix to the command. if tc.jobPrefix { args = append(args, jobID[0:3]) } else { args = append(args, jobID) } // Run command. code := cmd.Run(args) if tc.expectedErr == "" { must.Zero(t, code) } else { must.One(t, code) must.StrContains(t, ui.ErrorWriter.String(), tc.expectedErr) } }) } } // TODO(luiz): update once alloc restart supports -no-shutdown-delay. func TestJobRestartCommand_shutdownDelay_reschedule(t *testing.T) { ci.Parallel(t) // Start client and server and wait for node to be ready. 
srv, client, url := testServer(t, true, nil) defer srv.Shutdown() waitForNodes(t, client) testCases := []struct { name string args []string shutdownDelay bool }{ { name: "job reschedule with shutdown delay by default", args: []string{"-reschedule"}, shutdownDelay: true, }, { name: "job reschedule no shutdown delay", args: []string{"-reschedule", "-no-shutdown-delay"}, shutdownDelay: false, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { ui := cli.NewMockUi() cmd := &JobRestartCommand{Meta: Meta{Ui: ui}} // Register job with 2 allocations and shutdown_delay. shutdownDelay := 3 * time.Second jobID := nonAlphaNum.ReplaceAllString(tc.name, "-") job := testJob(jobID) job.TaskGroups[0].Count = pointer.Of(2) job.TaskGroups[0].Tasks[0].Config["run_for"] = "10m" job.TaskGroups[0].Tasks[0].ShutdownDelay = shutdownDelay job.TaskGroups[0].Tasks[0].Services = []*api.Service{{ Name: "service", Provider: "nomad", }} resp, _, err := client.Jobs().Register(job, nil) must.NoError(t, err) code := waitForSuccess(ui, client, fullId, t, resp.EvalID) must.Zero(t, code) ui.OutputWriter.Reset() // Wait for alloc to be running. allocStubs, _, err := client.Jobs().Allocations(jobID, true, nil) must.NoError(t, err) for _, alloc := range allocStubs { waitForAllocRunning(t, client, alloc.ID) } // Add address and job ID to the command and run. args := []string{ "-address", url, "-batch-size", "1", "-batch-wait", "0", "-yes", } args = append(args, tc.args...) args = append(args, jobID) code = cmd.Run(args) must.Zero(t, code) // Wait for all allocs to restart. reschedules := map[string]bool{} for _, alloc := range allocStubs { reschedules[alloc.ID] = true } allocs := waitAllocsRescheduled(t, client, reschedules) // Check that allocs have shutdown delay event. for _, alloc := range allocs { for _, s := range alloc.TaskStates { var killedEv *api.TaskEvent var killingEv *api.TaskEvent for _, ev := range s.Events { if strings.Contains(ev.Type, "Killed") { killedEv = ev } if strings.Contains(ev.Type, "Killing") { killingEv = ev } } diff := killedEv.Time - killingEv.Time if tc.shutdownDelay { must.GreaterEq(t, shutdownDelay, time.Duration(diff)) } else { // Add a bit of slack to account for the actual // shutdown time of the task. must.Between(t, shutdownDelay, time.Duration(diff), shutdownDelay+time.Second) } } } }) } } func TestJobRestartCommand_filterAllocs(t *testing.T) { ci.Parallel(t) task1 := api.NewTask("task_1", "mock_driver") task2 := api.NewTask("task_2", "mock_driver") task3 := api.NewTask("task_3", "mock_driver") jobV1 := api.NewServiceJob("example", "example", "global", 1). AddTaskGroup( api.NewTaskGroup("group_1", 1). AddTask(task1), ). AddTaskGroup( api.NewTaskGroup("group_2", 1). AddTask(task1). AddTask(task2), ). AddTaskGroup( api.NewTaskGroup("group_3", 1). AddTask(task3), ) jobV1.Version = pointer.Of(uint64(1)) jobV2 := api.NewServiceJob("example", "example", "global", 1). AddTaskGroup( api.NewTaskGroup("group_1", 1). AddTask(task1), ). AddTaskGroup( api.NewTaskGroup("group_2", 1). 
AddTask(task2), ) jobV2.Version = pointer.Of(uint64(2)) allAllocs := []AllocationListStubWithJob{} allocs := map[string]AllocationListStubWithJob{} for _, job := range []*api.Job{jobV1, jobV2} { for _, tg := range job.TaskGroups { for _, desired := range []string{api.AllocDesiredStatusRun, api.AllocDesiredStatusStop} { for _, client := range []string{api.AllocClientStatusRunning, api.AllocClientStatusComplete} { key := fmt.Sprintf("job_v%d_%s_%s_%s", *job.Version, *tg.Name, desired, client) alloc := AllocationListStubWithJob{ AllocationListStub: &api.AllocationListStub{ ID: key, JobVersion: *job.Version, TaskGroup: *tg.Name, DesiredStatus: desired, ClientStatus: client, }, Job: job, } allocs[key] = alloc allAllocs = append(allAllocs, alloc) } } } } testCases := []struct { name string args []string expectedAllocs []AllocationListStubWithJob }{ { name: "skip by group", args: []string{"-group", "group_1"}, expectedAllocs: []AllocationListStubWithJob{ allocs["job_v1_group_1_run_running"], allocs["job_v1_group_1_run_complete"], allocs["job_v1_group_1_stop_running"], allocs["job_v2_group_1_run_running"], allocs["job_v2_group_1_run_complete"], allocs["job_v2_group_1_stop_running"], }, }, { name: "skip by old group", args: []string{"-group", "group_3"}, expectedAllocs: []AllocationListStubWithJob{ allocs["job_v1_group_3_run_running"], allocs["job_v1_group_3_run_complete"], allocs["job_v1_group_3_stop_running"], }, }, { name: "skip by task", args: []string{"-task", "task_2"}, expectedAllocs: []AllocationListStubWithJob{ allocs["job_v1_group_2_run_running"], allocs["job_v1_group_2_run_complete"], allocs["job_v1_group_2_stop_running"], allocs["job_v2_group_2_run_running"], allocs["job_v2_group_2_run_complete"], allocs["job_v2_group_2_stop_running"], }, }, { name: "skip by old task", args: []string{"-task", "task_3"}, expectedAllocs: []AllocationListStubWithJob{ allocs["job_v1_group_3_run_running"], allocs["job_v1_group_3_run_complete"], allocs["job_v1_group_3_stop_running"], }, }, { name: "skip by group and task", args: []string{ "-group", "group_1", "-group", "group_2", "-task", "task_2", }, // Only group_2 has task_2 in all job versions. expectedAllocs: []AllocationListStubWithJob{ allocs["job_v1_group_2_run_running"], allocs["job_v1_group_2_run_complete"], allocs["job_v1_group_2_stop_running"], allocs["job_v2_group_2_run_running"], allocs["job_v2_group_2_run_complete"], allocs["job_v2_group_2_stop_running"], }, }, { name: "skip by status", args: []string{}, expectedAllocs: []AllocationListStubWithJob{ allocs["job_v1_group_1_run_running"], allocs["job_v1_group_1_run_complete"], allocs["job_v1_group_1_stop_running"], allocs["job_v1_group_2_run_running"], allocs["job_v1_group_2_run_complete"], allocs["job_v1_group_2_stop_running"], allocs["job_v1_group_3_run_running"], allocs["job_v1_group_3_run_complete"], allocs["job_v1_group_3_stop_running"], allocs["job_v2_group_1_run_running"], allocs["job_v2_group_1_run_complete"], allocs["job_v2_group_1_stop_running"], allocs["job_v2_group_2_run_running"], allocs["job_v2_group_2_run_complete"], allocs["job_v2_group_2_stop_running"], }, }, { name: "no matches by group", args: []string{"-group", "group_404"}, expectedAllocs: []AllocationListStubWithJob{}, }, { name: "no matches by task", args: []string{"-task", "task_404"}, expectedAllocs: []AllocationListStubWithJob{}, }, { name: "no matches by task with group", args: []string{ "-group", "group_1", "-task", "task_2", // group_1 never has task_2. 
			},
			expectedAllocs: []AllocationListStubWithJob{},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ui := cli.NewMockUi()
			cmd := &JobRestartCommand{
				Meta: Meta{Ui: &cli.ConcurrentUi{Ui: ui}},
			}

			args := append(tc.args, "-verbose", "-yes", "example")
			code, err := cmd.parseAndValidate(args)
			must.NoError(t, err)
			must.Zero(t, code)

			got := cmd.filterAllocs(allAllocs)
			must.SliceEqFunc(t, tc.expectedAllocs, got, func(a, b AllocationListStubWithJob) bool {
				return a.ID == b.ID
			})

			expected := set.FromFunc(tc.expectedAllocs, func(a AllocationListStubWithJob) string {
				return a.ID
			})
			for _, a := range allAllocs {
				if !expected.Contains(a.ID) {
					must.StrContains(t, ui.OutputWriter.String(), fmt.Sprintf("Skipping allocation %q", a.ID))
				}
			}
		})
	}
}

func TestJobRestartCommand_onErrorFail(t *testing.T) {
	ci.Parallel(t)

	ui := cli.NewMockUi()
	cmd := &JobRestartCommand{Meta: Meta{Ui: ui}}

	// Start client and server and wait for node to be ready.
	srv, client, url := testServer(t, true, nil)
	defer srv.Shutdown()

	parsedURL, err := neturl.Parse(url)
	must.NoError(t, err)

	waitForNodes(t, client)

	// Register a job with 3 allocations.
	jobID := "test_job_restart_command_fail_on_error"
	job := testJob(jobID)
	job.TaskGroups[0].Count = pointer.Of(3)

	resp, _, err := client.Jobs().Register(job, nil)
	must.NoError(t, err)

	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
	must.Zero(t, code)
	ui.OutputWriter.Reset()

	// Create a proxy to inject an error after 2 allocation restarts. The
	// proxy also counts how many restart requests are made so we can check
	// that the command stops after the error happens.
	var allocRestarts int32
	proxy := httptest.NewServer(&httputil.ReverseProxy{
		ModifyResponse: func(resp *http.Response) error {
			if strings.HasSuffix(resp.Request.URL.Path, "/restart") {
				count := atomic.AddInt32(&allocRestarts, 1)
				if count == 2 {
					return fmt.Errorf("fail")
				}
			}
			return nil
		},
		Rewrite: func(r *httputil.ProxyRequest) {
			r.SetURL(parsedURL)
		},
	})
	defer proxy.Close()

	// Run command with -on-error=fail.
	// Expect only 2 restart requests even though there are 3 allocations.
	code = cmd.Run([]string{
		"-address", proxy.URL,
		"-on-error", jobRestartOnErrorFail,
		jobID,
	})
	must.One(t, code)
	must.Eq(t, 2, allocRestarts)
}

// waitTasksRestarted blocks until the given allocations have restarted (or
// not) as expected. Returns a list with the updated state of the allocations.
//
// To determine if a restart happened the function looks for a "Restart
// Signaled" event in the list of task events. Allocations that are reused
// between tests may contain a restart event from a past test case, leading to
// false positives.
//
// The restarts map is structured as group name -> task name -> whether the
// task is expected to restart.
func waitTasksRestarted(
	t *testing.T,
	client *api.Client,
	allocs []*api.AllocationListStub,
	restarts map[string]map[string]bool,
) []*api.Allocation {
	t.Helper()

	var newAllocs []*api.Allocation
	testutil.WaitForResult(func() (bool, error) {
		newAllocs = make([]*api.Allocation, 0, len(allocs))
		for _, alloc := range allocs {
			if _, ok := restarts[alloc.TaskGroup]; !ok {
				t.Fatalf("Missing group %q in restarts map", alloc.TaskGroup)
			}

			// Skip allocations that are not supposed to be running.
			if alloc.DesiredStatus != api.AllocDesiredStatusRun {
				continue
			}

			updated, _, err := client.Allocations().Info(alloc.ID, nil)
			if err != nil {
				return false, err
			}
			newAllocs = append(newAllocs, updated)

			for task, state := range updated.TaskStates {
				restarted := false
				for _, ev := range state.Events {
					if ev.Type == api.TaskRestartSignal {
						restarted = true
						break
					}
				}

				if restarted && !restarts[updated.TaskGroup][task] {
					return false, fmt.Errorf(
						"task %q in alloc %s for group %q not expected to restart",
						task, updated.ID, updated.TaskGroup,
					)
				}
				if !restarted && restarts[updated.TaskGroup][task] {
					return false, fmt.Errorf(
						"task %q in alloc %s for group %q expected to restart but didn't",
						task, updated.ID, updated.TaskGroup,
					)
				}
			}
		}
		return true, nil
	}, func(err error) {
		must.NoError(t, err)
	})

	return newAllocs
}

// waitAllocsRescheduled blocks until the given allocations have been
// rescheduled (or not) as expected. Returns a list with the updated state of
// the allocations.
//
// To determine if an allocation has been rescheduled the function looks for
// a non-empty NextAllocation field.
//
// The reschedules map maps allocation IDs to a boolean indicating if a
// reschedule is expected for that allocation.
func waitAllocsRescheduled(t *testing.T, client *api.Client, reschedules map[string]bool) []*api.Allocation {
	t.Helper()

	var newAllocs []*api.Allocation
	testutil.WaitForResult(func() (bool, error) {
		newAllocs = make([]*api.Allocation, 0, len(reschedules))
		for allocID, reschedule := range reschedules {
			alloc, _, err := client.Allocations().Info(allocID, nil)
			if err != nil {
				return false, err
			}
			newAllocs = append(newAllocs, alloc)

			wasRescheduled := alloc.NextAllocation != ""
			if wasRescheduled && !reschedule {
				return false, fmt.Errorf("alloc %s not expected to be rescheduled", alloc.ID)
			}
			if !wasRescheduled && reschedule {
				return false, fmt.Errorf("alloc %s expected to be rescheduled but wasn't", alloc.ID)
			}
		}
		return true, nil
	}, func(err error) {
		must.NoError(t, err)
	})

	return newAllocs
}

// getRestartBatches returns a list of allocations per batch of restarts.
//
// Since restarts are issued concurrently, it's expected that allocations in
// the same batch have fairly close LastRestart times, so a 1s delay between
// restarts may be enough to indicate a new batch.
func getRestartBatches(allocs []*api.Allocation, groups []string, task string) [][]*api.Allocation {
	groupsSet := set.From(groups)
	batches := [][]*api.Allocation{}

	type allocRestart struct {
		alloc   *api.Allocation
		restart time.Time
	}

	restarts := make([]allocRestart, 0, len(allocs))
	for _, alloc := range allocs {
		if !groupsSet.Contains(alloc.TaskGroup) {
			continue
		}

		restarts = append(restarts, allocRestart{
			alloc:   alloc,
			restart: alloc.TaskStates[task].LastRestart,
		})
	}

	sort.Slice(restarts, func(i, j int) bool {
		return restarts[i].restart.Before(restarts[j].restart)
	})

	prev := restarts[0].restart
	batch := []*api.Allocation{}
	for _, r := range restarts {
		if r.restart.Sub(prev) >= time.Second {
			prev = r.restart
			batches = append(batches, batch)
			batch = []*api.Allocation{}
		}
		batch = append(batch, r.alloc)
	}
	batches = append(batches, batch)

	return batches
}