From 7d246b56b7a0f71532094e143a0ed4d23046f114 Mon Sep 17 00:00:00 2001
From: Preetha Appan <preetha@hashicorp.com>
Date: Thu, 12 Apr 2018 15:57:06 -0500
Subject: [PATCH] Added section on failure recovery under operating a job with
 details and examples of different restarts.

---
 .../check-restart.html.md                     | 23 +++++
 .../failure-handling-strategies/index.html.md | 25 +++++
 .../reschedule.html.md                        | 92 +++++++++++++++++++
 .../restart.html.md                           | 91 ++++++++++++++++++
 website/source/layouts/docs.erb               | 14 +++
 5 files changed, 245 insertions(+)
 create mode 100644 website/source/docs/operating-a-job/failure-handling-strategies/check-restart.html.md
 create mode 100644 website/source/docs/operating-a-job/failure-handling-strategies/index.html.md
 create mode 100644 website/source/docs/operating-a-job/failure-handling-strategies/reschedule.html.md
 create mode 100644 website/source/docs/operating-a-job/failure-handling-strategies/restart.html.md

diff --git a/website/source/docs/operating-a-job/failure-handling-strategies/check-restart.html.md b/website/source/docs/operating-a-job/failure-handling-strategies/check-restart.html.md
new file mode 100644
index 000000000..a8ae06c11
--- /dev/null
+++ b/website/source/docs/operating-a-job/failure-handling-strategies/check-restart.html.md
@@ -0,0 +1,23 @@
+---
+layout: "docs"
+page_title: "Check Restart Stanza - Operating a Job"
+sidebar_current: "docs-operating-a-job-failure-handling-strategies-check-restart"
+description: |-
+  Nomad can restart service job tasks if they have a failing health check based on
+  configuration specified in the `check_restart` stanza. Restarts are done locally on the node
+  running the task based on their `restart` policy.
+---
+
+# Check Restart Stanza
+
+The [`check_restart` stanza][check restart] instructs Nomad when to restart tasks with unhealthy service checks.
+When a health check in Consul has been unhealthy for the `limit` specified in a `check_restart` stanza,
+the task is restarted according to the task group's restart policy.
+
+The `limit` field is used to specify the number of times a failing health check is seen before local restarts are attempted.
+Operators can also specify a `grace` duration to wait after a task restarts before checking its health.
+
+We recommend configuring the check restart on services if it's likely that a restart would resolve the failure. This
+is applicable in cases like temporary memory issues on the service.
+
+[check restart]: /docs/job-specification/check_restart.html "Nomad check restart Stanza"
\ No newline at end of file
diff --git a/website/source/docs/operating-a-job/failure-handling-strategies/index.html.md b/website/source/docs/operating-a-job/failure-handling-strategies/index.html.md
new file mode 100644
index 000000000..087040c3c
--- /dev/null
+++ b/website/source/docs/operating-a-job/failure-handling-strategies/index.html.md
@@ -0,0 +1,25 @@
+---
+layout: "docs"
+page_title: "Handling Failures - Operating a Job"
+sidebar_current: "docs-operating-a-job-failure-handling-strategies"
+description: |-
+  This section describes features in Nomad that automate recovering from failed tasks.
+---
+
+# Failure Recovery Strategies
+
+Most applications deployed in Nomad are either long-running services or one-time batch jobs.
+They can fail for various reasons like:
+
+- A temporary error in the service that resolves when it's restarted.
+- An upstream dependency might not be available, leading to a health check failure.
+- Disk, memory, or CPU contention on the node that the application is running on.
+- The application uses Docker and the Docker daemon on that node is no longer running.
+
+Nomad provides configurable options to enable recovering failed tasks to avoid downtime. Nomad will
+try to restart a failed task on the node it is running on, and also try to reschedule it on another node.
+Please see one of the guides below or use the navigation on the left for details on each option:
+
+1. [Local Restarts](/docs/operating-a-job/failure-handling-strategies/restart.html)
+1. [Check Restarts](/docs/operating-a-job/failure-handling-strategies/check-restart.html)
+1. [Rescheduling](/docs/operating-a-job/failure-handling-strategies/reschedule.html)
diff --git a/website/source/docs/operating-a-job/failure-handling-strategies/reschedule.html.md b/website/source/docs/operating-a-job/failure-handling-strategies/reschedule.html.md
new file mode 100644
index 000000000..e3e3f2f80
--- /dev/null
+++ b/website/source/docs/operating-a-job/failure-handling-strategies/reschedule.html.md
@@ -0,0 +1,92 @@
+---
+layout: "docs"
+page_title: "Reschedule Stanza - Operating a Job"
+sidebar_current: "docs-operating-a-job-failure-handling-strategies-reschedule"
+description: |-
+  Nomad can reschedule failing tasks after any local restart attempts have been
+  exhausted. This is useful to recover from failures stemming from problems in the node
+  running the task.
+---
+
+# Reschedule Stanza
+
+Tasks can sometimes fail due to network, CPU or memory issues on the node running the task. In such situations,
+Nomad can reschedule the task on another node. The [`reschedule` stanza][reschedule] can be used to configure how
+Nomad should try placing failed tasks on another node in the cluster. Reschedule attempts have a delay between
+each attempt, and the delay can be configured to increase between each rescheduling attempt according to a configurable
+`delay_function`. See the [documentation][reschedule] for more information on all the options for rescheduling.
+
+Service jobs are configured by default to have unlimited reschedule attempts. We recommend using the reschedule
+stanza to ensure that failed tasks are automatically reattempted on another node without needing operator intervention.
+
+## Example
+The following CLI example shows job and allocation statuses for a task being rescheduled by Nomad.
+The CLI shows the number of previous attempts if there is a limit on the number of reschedule attempts.
+The CLI also shows when the next reschedule will be attempted.
+
+```text
+$ nomad job status demo
+ID            = demo
+Name          = demo
+Submit Date   = 2018-04-12T15:48:37-05:00
+Type          = service
+Priority      = 50
+Datacenters   = dc1
+Status        = pending
+Periodic      = false
+Parameterized = false
+
+Summary
+Task Group  Queued  Starting  Running  Failed  Complete  Lost
+demo        0       0         0        2       0         0
+
+Future Rescheduling Attempts
+Task Group  Eval ID   Eval Time
+demo        ee3de93f  5s from now
+
+Allocations
+ID        Node ID   Task Group  Version  Desired  Status  Created  Modified
+39d7823d  f2c2eaa6  demo        0        run      failed  5s ago   5s ago
+fafb011b  f2c2eaa6  demo        0        run      failed  11s ago  10s ago
+
+```
+
+```text
+$ nomad alloc status 3d0b
+ID                     = 3d0bbdb1
+Eval ID                = 79b846a9
+Name                   = demo.demo[0]
+Node ID                = 8a184f31
+Job ID                 = demo
+Job Version            = 0
+Client Status          = failed
+Client Description     = <none>
+Desired Status         = run
+Desired Description    = <none>
+Created                = 15s ago
+Modified               = 15s ago
+Reschedule Attempts    = 3/5
+Reschedule Eligibility = 25s from now
+
+Task "demo" is "dead"
+Task Resources
+CPU      Memory   Disk     IOPS  Addresses
+100 MHz  300 MiB  300 MiB  0     p1: 127.0.0.1:27646
+
+Task Events:
+Started At     = 2018-04-12T20:44:25Z
+Finished At    = 2018-04-12T20:44:25Z
+Total Restarts = 0
+Last Restart   = N/A
+
+Recent Events:
+Time                       Type            Description
+2018-04-12T15:44:25-05:00  Not Restarting  Policy allows no restarts
+2018-04-12T15:44:25-05:00  Terminated      Exit Code: 127
+2018-04-12T15:44:25-05:00  Started         Task started by client
+2018-04-12T15:44:25-05:00  Task Setup      Building Task Directory
+2018-04-12T15:44:25-05:00  Received        Task received by client
+
+```
+
+[reschedule]: /docs/job-specification/reschedule.html "Nomad reschedule Stanza"
\ No newline at end of file
diff --git a/website/source/docs/operating-a-job/failure-handling-strategies/restart.html.md b/website/source/docs/operating-a-job/failure-handling-strategies/restart.html.md
new file mode 100644
index 000000000..1cff939c8
--- /dev/null
+++ b/website/source/docs/operating-a-job/failure-handling-strategies/restart.html.md
@@ -0,0 +1,91 @@
+---
+layout: "docs"
+page_title: "Restart Stanza - Operating a Job"
+sidebar_current: "docs-operating-a-job-failure-handling-strategies-local-restarts"
+description: |-
+  Nomad can restart a task on the node it is running on to recover from
+  failures. Task restarts can be configured to be limited by number of attempts within
+  a specific interval.
+---
+
+# Restart Stanza
+
+To enable restarting a failed task on the node it is running on, the task group can be annotated
+with configurable options using the [`restart` stanza][restart]. Nomad will restart the failed task
+up to `attempts` times within a provided `interval`. Operators can also choose whether to
+keep attempting restarts on the same node, or to fail the task so that it can be rescheduled
+on another node, via the `mode` parameter.
+
+We recommend setting `mode` to `fail` in the restart stanza to allow rescheduling the task on another node.
+
+
+## Example
+The following CLI example shows job status and allocation status for a failed task that is being restarted by Nomad.
+Allocations are in the `pending` state while restarts are attempted. The `Recent Events` section in the CLI
+shows ongoing restart attempts.
+
+```text
+$ nomad job status demo
+ID            = demo
+Name          = demo
+Submit Date   = 2018-04-12T14:37:18-05:00
+Type          = service
+Priority      = 50
+Datacenters   = dc1
+Status        = running
+Periodic      = false
+Parameterized = false
+
+Summary
+Task Group  Queued  Starting  Running  Failed  Complete  Lost
+demo        0       3         0        0       0         0
+
+Allocations
+ID        Node ID   Task Group  Version  Desired  Status   Created  Modified
+ce5bf1d1  8a184f31  demo        0        run      pending  27s ago  5s ago
+d5dee7c8  8a184f31  demo        0        run      pending  27s ago  5s ago
+ed815997  8a184f31  demo        0        run      pending  27s ago  5s ago
+```
+
+```text
+$ nomad alloc status ce5b
+ID                  = ce5bf1d1
+Eval ID             = 05681b90
+Name                = demo.demo[1]
+Node ID             = 8a184f31
+Job ID              = demo
+Job Version         = 0
+Client Status       = pending
+Client Description  = <none>
+Desired Status      = run
+Desired Description = <none>
+Created             = 31s ago
+Modified            = 9s ago
+
+Task "demo" is "pending"
+Task Resources
+CPU      Memory   Disk     IOPS  Addresses
+100 MHz  300 MiB  300 MiB  0
+
+Task Events:
+Started At     = 2018-04-12T19:37:40Z
+Finished At    = N/A
+Total Restarts = 3
+Last Restart   = 2018-04-12T14:37:40-05:00
+
+Recent Events:
+Time                       Type        Description
+2018-04-12T14:37:40-05:00  Restarting  Task restarting in 11.686056069s
+2018-04-12T14:37:40-05:00  Terminated  Exit Code: 127
+2018-04-12T14:37:40-05:00  Started     Task started by client
+2018-04-12T14:37:29-05:00  Restarting  Task restarting in 10.97348449s
+2018-04-12T14:37:29-05:00  Terminated  Exit Code: 127
+2018-04-12T14:37:29-05:00  Started     Task started by client
+2018-04-12T14:37:19-05:00  Restarting  Task restarting in 10.619985509s
+2018-04-12T14:37:19-05:00  Terminated  Exit Code: 127
+2018-04-12T14:37:19-05:00  Started     Task started by client
+2018-04-12T14:37:19-05:00  Task Setup  Building Task Directory
+```
+
+
+[restart]: /docs/job-specification/restart.html "Nomad restart Stanza"
diff --git a/website/source/layouts/docs.erb b/website/source/layouts/docs.erb
index 4fd16fcb7..7e3eacacc 100644
--- a/website/source/layouts/docs.erb
+++ b/website/source/layouts/docs.erb
@@ -132,6 +132,20 @@
               </li>
             </ul>
           </li>
+          <li<%= sidebar_current("docs-operating-a-job-failure-handling-strategies") %>>
+            <a href="/docs/operating-a-job/failure-handling-strategies/index.html">Failure Recovery Strategies</a>
+            <ul class="nav">
+              <li<%= sidebar_current("docs-operating-a-job-failure-handling-strategies-local-restarts") %>>
+                <a href="/docs/operating-a-job/failure-handling-strategies/restart.html">Local Restarts</a>
+              </li>
+              <li<%= sidebar_current("docs-operating-a-job-failure-handling-strategies-check-restart") %>>
+                <a href="/docs/operating-a-job/failure-handling-strategies/check-restart.html">Check Restarts</a>
+              </li>
+              <li<%= sidebar_current("docs-operating-a-job-failure-handling-strategies-reschedule") %>>
+                <a href="/docs/operating-a-job/failure-handling-strategies/reschedule.html">Rescheduling</a>
+              </li>
+            </ul>
+          </li>
         </ul>
       </li>