cli: job restart command (#16278)

Implement the new `nomad job restart` command that allows operators to
restart allocation tasks or reschedule the entire allocation.

Restarts can be batched to target multiple allocations in parallel.
Between each batch the command can stop and hold for a predefined time
or until the user confirms that the process should proceed.

This implements the "Stateless Restarts" alternative from the original
RFC
(https://gist.github.com/schmichael/e0b8b2ec1eb146301175fd87ddd46180).
The original concept is still worth implementing, as it allows this
functionality to be exposed over an API that can be consumed by the
Nomad UI and other clients. But the implementation turned out to be more
complex than we initially expected so we thought it would be better to
release a stateless CLI-based implementation first to gather feedback
and validate the restart behaviour.

Co-authored-by: Shishir Mahajan <smahajan@roblox.com>
This commit is contained in:
Luiz Aoqui 2023-03-23 18:28:26 -04:00 committed by GitHub
parent 4ccd999304
commit e5d31bca61
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 3119 additions and 5 deletions

3
.changelog/16278.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
cli: Added new `nomad job restart` command to restart all allocations for a job
```

View File

@ -976,6 +976,12 @@ func (t *Task) SetLogConfig(l *LogConfig) *Task {
return t
}
// SetLifecycle assigns the lifecycle configuration l to the task's Lifecycle
// field and returns the same task, so the call can be chained with other
// setters.
func (t *Task) SetLifecycle(l *TaskLifecycle) *Task {
t.Lifecycle = l
return t
}
// TaskState tracks the current state of a task and events that caused state
// transitions.
type TaskState struct {

View File

@ -415,6 +415,14 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
Meta: meta,
}, nil
},
"job restart": func() (cli.Command, error) {
// Use a *cli.ConcurrentUi because this command spawns several
// goroutines that write to the terminal concurrently.
meta.Ui = &cli.ConcurrentUi{Ui: meta.Ui}
return &JobRestartCommand{
Meta: meta,
}, nil
},
"job deployments": func() (cli.Command, error) {
return &JobDeploymentsCommand{
Meta: meta,

View File

@ -62,6 +62,13 @@ func limit(s string, length int) string {
return s[:length]
}
// indentString indents every line of s after the first by pad space
// characters. The first line is returned untouched, which lets callers place
// the result after an existing prefix on the same output line.
func indentString(s string, pad int) string {
	indent := strings.Repeat(" ", pad)
	// Each newline is followed by the indentation, so only continuation
	// lines are shifted.
	lineBreak := fmt.Sprintf("\n%s", indent)
	return strings.ReplaceAll(s, "\n", lineBreak)
}
// wrapAtLengthWithPadding wraps the given text at the maxLineLength, taking
// into account any provided left padding.
func wrapAtLengthWithPadding(s string, pad int) string {

1205
command/job_restart.go Normal file

File diff suppressed because it is too large Load Diff

1591
command/job_restart_test.go Normal file

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,7 @@ import (
"flag"
"fmt"
"os"
"reflect"
"strings"
"github.com/hashicorp/nomad/api"
@ -176,7 +177,35 @@ func (m *Meta) allNamespaces() bool {
}
func (m *Meta) Colorize() *colorstring.Colorize {
_, coloredUi := m.Ui.(*cli.ColoredUi)
ui := m.Ui
coloredUi := false
// Meta.Ui may wrap other cli.Ui instances, so unwrap them until we find a
// *cli.ColoredUi or there is nothing left to unwrap.
for {
if ui == nil {
break
}
_, coloredUi = ui.(*cli.ColoredUi)
if coloredUi {
break
}
v := reflect.ValueOf(ui)
if v.Kind() == reflect.Ptr {
v = v.Elem()
}
for i := 0; i < v.NumField(); i++ {
if !v.Field(i).CanInterface() {
continue
}
ui, _ = v.Field(i).Interface().(cli.Ui)
if ui != nil {
break
}
}
}
return &colorstring.Colorize{
Colors: colorstring.DefaultColors,

View File

@ -152,21 +152,48 @@ func waitForNodes(t *testing.T, client *api.Client) {
})
}
func waitForAllocRunning(t *testing.T, client *api.Client, allocID string) {
// waitForJobAllocsStatus hands testutil.WaitForResult a check that lists the
// allocations of job jobID (authenticating with token) and succeeds only when
// at least one allocation is returned and every allocation has the given
// client status. If the wait gives up, the last error is reported through
// must.NoError.
func waitForJobAllocsStatus(t *testing.T, client *api.Client, jobID string, status string, token string) {
	// allInStatus is the condition evaluated by testutil.WaitForResult.
	allInStatus := func() (bool, error) {
		opts := &api.QueryOptions{AuthToken: token}
		jobAllocs, _, queryErr := client.Jobs().Allocations(jobID, true, opts)
		switch {
		case queryErr != nil:
			return false, fmt.Errorf("failed to query job allocs: %v", queryErr)
		case len(jobAllocs) == 0:
			return false, fmt.Errorf("no allocs")
		}

		for i := range jobAllocs {
			if got := jobAllocs[i].ClientStatus; got != status {
				return false, fmt.Errorf("alloc status is %q not %q", got, status)
			}
		}
		return true, nil
	}

	// onFailure surfaces the last error returned by the check.
	onFailure := func(err error) {
		must.NoError(t, err)
	}

	testutil.WaitForResult(allInStatus, onFailure)
}
func waitForAllocStatus(t *testing.T, client *api.Client, allocID string, status string) {
testutil.WaitForResult(func() (bool, error) {
alloc, _, err := client.Allocations().Info(allocID, nil)
if err != nil {
return false, err
}
if alloc.ClientStatus == api.AllocClientStatusRunning {
if alloc.ClientStatus == status {
return true, nil
}
return false, fmt.Errorf("alloc status: %s", alloc.ClientStatus)
return false, fmt.Errorf("alloc status is %q not %q", alloc.ClientStatus, status)
}, func(err error) {
t.Fatalf("timed out waiting for alloc to be running: %v", err)
must.NoError(t, err)
})
}
// waitForAllocRunning delegates to waitForAllocStatus with
// api.AllocClientStatusRunning as the expected client status for the
// allocation allocID.
func waitForAllocRunning(t *testing.T, client *api.Client, allocID string) {
waitForAllocStatus(t, client, allocID, api.AllocClientStatusRunning)
}
func waitForCheckStatus(t *testing.T, client *api.Client, allocID, status string) {
testutil.WaitForResult(func() (bool, error) {
results, err := client.Allocations().Checks(allocID, nil)

View File

@ -0,0 +1,234 @@
---
layout: docs
page_title: 'Commands: job restart'
description: |
The job restart command is used to restart allocations for a job.
---
# Command: job restart
The `job restart` command is used to restart or reschedule allocations for a
particular job.
Restarting the job calls the [Restart Allocation][api_alloc_restart] API
endpoint to restart the tasks inside allocations, so the allocations themselves
are not modified but rather restarted in-place.
Rescheduling the job uses the [Stop Allocation][api_alloc_stop] API endpoint to
stop the allocations and trigger the Nomad scheduler to compute new placements.
This may cause the new allocations to be scheduled on different clients from
the originals.
## Usage
```plaintext
nomad job restart [options] <job>
```
The `job restart` command requires a single argument, specifying the job ID to
restart.
The command can operate in batches and wait until all restarted or
rescheduled allocations are running again before proceeding to the next batch.
It is also possible to specify additional time to wait between batches.
Allocations can be restarted in-place or rescheduled. When restarting
in-place the command may target specific tasks in the allocations, restart
only tasks that are currently running, or restart all tasks, even the ones
that have already run. Allocations can also be targeted by groups and tasks.
When both groups and tasks are defined only the tasks for the allocations of
those groups are restarted.
When rescheduling, the current allocations are stopped, triggering the Nomad
scheduler to create replacement allocations that may be placed on different
clients. The command waits until the new allocations have client status `running`
before proceeding with the remaining batches. Service health checks are not
taken into account.
By default the command restarts all running tasks in-place with one allocation
per batch.
When ACLs are enabled, this command requires a token with the
`alloc-lifecycle` and `read-job` capabilities for the job's namespace. The
`list-jobs` capability is required to run the command with a job prefix instead
of the exact job ID.
## General Options
@include 'general_options.mdx'
## Restart Options
- `-all-tasks`: If set, all tasks in the allocations are restarted, even the
ones that have already run, such as non-sidecar tasks. Tasks will restart
following their [`lifecycle`][] order. This option cannot be used with
`-task`.
- `-batch-size=<n|n%>`: Number of allocations to restart at once. It may be
defined as a percentage value of the current number of running allocations.
Percentage values are rounded up to increase parallelism. Defaults to `1`.
- `-batch-wait=<duration|ask>`: Time to wait between restart batches. If set
to `ask` the command halts between batches and waits for user input on how to
proceed. If the answer is a time duration all remaining batches will use this
new value. Defaults to `0`.
- `-group=<group-name>`: Only restart allocations for the given group. Can be
specified multiple times. If no group is set all allocations for the job are
restarted.
- `-no-shutdown-delay`: Ignore the group and task [`shutdown_delay`][]
configuration so there is no delay between service deregistration and task
shutdown or restart. Note that using this flag will result in failed network
connections to the allocation being restarted.
- `-reschedule`: If set, allocations are stopped and rescheduled instead of
restarted in-place. Since the group is not modified the restart does not
create a new deployment, and so values defined in [`update`][] blocks, such
as [`max_parallel`][], are not taken into account. This option cannot be used
with `-task`.
- `-on-error=<ask|fail>`: Determines what action to take when an error happens
during a restart batch. If `ask` the command stops and waits for user
confirmation on how to proceed. If `fail` the command exits immediately.
Defaults to `ask`.
- `-task=<task-name>`: Specify the task to restart. Can be specified multiple
times. If groups are also specified the task must exist in at least one of
them. If no task is set only tasks that are currently running are restarted.
For example, non-sidecar tasks that already ran are not restarted unless
`-all-tasks` is used instead. This option cannot be used with `-all-tasks` or
`-reschedule`.
- `-yes`: Automatic yes to prompts. If set, the command automatically restarts
multi-region jobs only in the region targeted by the command, ignores batch
errors, and automatically proceeds with the remaining batches without
waiting. Use `-on-error` and `-batch-wait` to adjust these behaviors.
- `-verbose`: Display full information.
## Examples
Restart running tasks of all allocations.
```shell-session
$ nomad job restart example
==> 2023-02-28T17:36:31-05:00: Restarting 5 allocations
2023-02-28T17:36:31-05:00: Restarting running tasks in allocation "32e143f8" for group "proxy"
2023-02-28T17:36:31-05:00: Restarting running tasks in allocation "388129e0" for group "web"
2023-02-28T17:36:31-05:00: Restarting running tasks in allocation "4fd581ee" for group "proxy"
2023-02-28T17:36:32-05:00: Restarting running tasks in allocation "77d5c4f6" for group "proxy"
2023-02-28T17:36:32-05:00: Restarting running tasks in allocation "d4303a30" for group "web"
==> 2023-02-28T17:36:32-05:00: Finished job restart
All allocations restarted successfully!
```
Target allocations of a specific group to restart.
```shell-session
$ nomad job restart -group=web example
==> 2023-02-28T17:37:36-05:00: Restarting 2 allocations
2023-02-28T17:37:36-05:00: Restarting running tasks in allocation "388129e0" for group "web"
2023-02-28T17:37:37-05:00: Restarting running tasks in allocation "d4303a30" for group "web"
==> 2023-02-28T17:37:37-05:00: Finished job restart
All allocations restarted successfully!
```
Reschedule allocations instead of restarting them in-place.
```shell-session
$ nomad job restart -group=web -reschedule example
==> 2023-02-28T17:39:14-05:00: Restarting 2 allocations
2023-02-28T17:39:14-05:00: Rescheduling allocation "388129e0" for group "web"
2023-02-28T17:39:45-05:00: Rescheduling allocation "d4303a30" for group "web"
==> 2023-02-28T17:40:16-05:00: Finished job restart
All allocations restarted successfully!
```
Batch allocations to restart them 2 at a time.
```shell-session
$ nomad job restart -batch-size=2 example
==> 2023-02-28T17:40:58-05:00: Restarting 5 allocations
==> 2023-02-28T17:40:58-05:00: Restarting 1st batch of 2 allocations
2023-02-28T17:40:58-05:00: Restarting running tasks in allocation "653f983e" for group "web"
2023-02-28T17:40:58-05:00: Restarting running tasks in allocation "4d18e545" for group "web"
==> 2023-02-28T17:40:58-05:00: Restarting 2nd batch of 2 allocations
2023-02-28T17:40:58-05:00: Restarting running tasks in allocation "32e143f8" for group "proxy"
2023-02-28T17:40:58-05:00: Restarting running tasks in allocation "4fd581ee" for group "proxy"
==> 2023-02-28T17:40:59-05:00: Restarting 3rd batch of 1 allocations
2023-02-28T17:40:59-05:00: Restarting running tasks in allocation "77d5c4f6" for group "proxy"
==> 2023-02-28T17:40:59-05:00: Finished job restart
All allocations restarted successfully!
```
Batch allocations as a percentage of total running allocations.
```shell-session
$ nomad job restart -batch-size=50% example
==> 2023-02-28T18:52:47-05:00: Restarting 5 allocations
==> 2023-02-28T18:52:47-05:00: Restarting 1st batch of 3 allocations
2023-02-28T18:52:47-05:00: Restarting running tasks in allocation "d28f6f60" for group "proxy"
2023-02-28T18:52:47-05:00: Restarting running tasks in allocation "b931b496" for group "proxy"
2023-02-28T18:52:47-05:00: Restarting running tasks in allocation "18673b40" for group "proxy"
==> 2023-02-28T18:52:48-05:00: Restarting 2nd batch of 2 allocations
2023-02-28T18:52:48-05:00: Restarting running tasks in allocation "439b1632" for group "web"
2023-02-28T18:52:48-05:00: Restarting running tasks in allocation "8fae60f6" for group "web"
==> 2023-02-28T18:52:48-05:00: Finished job restart
All allocations restarted successfully!
```
Pause between batches of restart and wait for user input on how to proceed.
```shell-session
$ nomad job restart -batch-size=2 -batch-wait=ask example
==> 2023-02-28T18:04:19-05:00: Restarting 5 allocations
==> 2023-02-28T18:04:19-05:00: Restarting 1st batch of 2 allocations
2023-02-28T18:04:19-05:00: Restarting running tasks in allocation "4d18e545" for group "web"
2023-02-28T18:04:19-05:00: Restarting running tasks in allocation "653f983e" for group "web"
==> 2023-02-28T18:04:19-05:00: Proceed with next batch? [Y/n/<duration>] y
==> 2023-02-28T18:04:20-05:00: Restarting 2nd batch of 2 allocations
2023-02-28T18:04:20-05:00: Restarting running tasks in allocation "4fd581ee" for group "proxy"
2023-02-28T18:04:20-05:00: Restarting running tasks in allocation "32e143f8" for group "proxy"
==> 2023-02-28T18:04:20-05:00: Proceed with next batch? [Y/n/<duration>] 10s
==> 2023-02-28T18:04:22-05:00: Proceeding restarts with new wait time of 10s
==> 2023-02-28T18:04:22-05:00: Waiting 10s before restarting the next batch
==> 2023-02-28T18:04:32-05:00: Restarting 3rd batch of 1 allocations
2023-02-28T18:04:32-05:00: Restarting running tasks in allocation "77d5c4f6" for group "proxy"
==> 2023-02-28T18:04:32-05:00: Finished job restart
All allocations restarted successfully!
```
Wait 10 seconds before each restart batch.
```shell-session
$ nomad job restart -batch-size=2 -batch-wait=10s example
==> 2023-02-28T18:03:43-05:00: Restarting 5 allocations
==> 2023-02-28T18:03:43-05:00: Restarting 1st batch of 2 allocations
2023-02-28T18:03:43-05:00: Restarting running tasks in allocation "653f983e" for group "web"
2023-02-28T18:03:43-05:00: Restarting running tasks in allocation "4d18e545" for group "web"
==> 2023-02-28T18:03:43-05:00: Waiting 10s before restarting the next batch
==> 2023-02-28T18:03:53-05:00: Restarting 2nd batch of 2 allocations
2023-02-28T18:03:53-05:00: Restarting running tasks in allocation "4fd581ee" for group "proxy"
2023-02-28T18:03:53-05:00: Restarting running tasks in allocation "32e143f8" for group "proxy"
==> 2023-02-28T18:03:53-05:00: Waiting 10s before restarting the next batch
==> 2023-02-28T18:04:03-05:00: Restarting 3rd batch of 1 allocations
2023-02-28T18:04:03-05:00: Restarting running tasks in allocation "77d5c4f6" for group "proxy"
==> 2023-02-28T18:04:03-05:00: Finished job restart
All allocations restarted successfully!
```
[`lifecycle`]: /nomad/docs/job-specification/lifecycle
[`max_parallel`]: /nomad/docs/job-specification/update#max_parallel
[`shutdown_delay`]: /nomad/docs/job-specification/task#shutdown_delay
[`update`]: /nomad/docs/job-specification/update
[api_alloc_restart]: /nomad/api-docs/allocations#restart-allocation
[api_alloc_stop]: /nomad/api-docs/allocations#stop-allocation

View File

@ -542,6 +542,10 @@
"title": "promote",
"path": "commands/job/promote"
},
{
"title": "restart",
"path": "commands/job/restart"
},
{
"title": "revert",
"path": "commands/job/revert"