client: ignore restart issued to terminal allocations (#17175)
* client: ignore restart issued to terminal allocations This PR fixes a bug where issuing a restart to a terminal allocation would cause the allocation to run its hooks anyway. This was particularly apparent with the group_service_hook, which would register services but never deregister them — the allocation would effectively be in a "zombie" state, prepped to run tasks that never will run. * e2e: add e2e test for alloc restart zombies * cl: tweak text Co-authored-by: Tim Gross <tgross@hashicorp.com> --------- Co-authored-by: Tim Gross <tgross@hashicorp.com>
This commit is contained in:
parent
6814e8e6d9
commit
e04ff0d935
|
@ -0,0 +1,3 @@
|
|||
```release-note:bug
|
||||
client: Fixed a bug where restarting a terminal allocation turned it into a zombie in which allocation and task hooks would run unexpectedly
|
||||
```
|
|
@ -1273,6 +1273,12 @@ func (ar *allocRunner) RestartAll(event *structs.TaskEvent) error {
|
|||
|
||||
// restartTasks restarts all task runners concurrently.
|
||||
func (ar *allocRunner) restartTasks(ctx context.Context, event *structs.TaskEvent, failure bool, force bool) error {
|
||||
|
||||
// ensure we are not trying to restart an alloc that is terminal
|
||||
if !ar.shouldRun() {
|
||||
return fmt.Errorf("restart of an alloc that should not run")
|
||||
}
|
||||
|
||||
waitCh := make(chan struct{})
|
||||
var err *multierror.Error
|
||||
var errMutex sync.Mutex
|
||||
|
|
|
@ -0,0 +1,61 @@
|
|||
// Copyright (c) HashiCorp, Inc.
|
||||
// SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
package clientstate
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/e2e/e2eutil"
|
||||
"github.com/hashicorp/nomad/helper/uuid"
|
||||
"github.com/shoenig/test/must"
|
||||
"github.com/shoenig/test/wait"
|
||||
)
|
||||
|
||||
func TestClientAllocs(t *testing.T) {
|
||||
nomad := e2eutil.NomadClient(t)
|
||||
|
||||
e2eutil.WaitForLeader(t, nomad)
|
||||
e2eutil.WaitForNodesReady(t, nomad, 1)
|
||||
|
||||
t.Run("testAllocZombie", testAllocZombie)
|
||||
}
|
||||
|
||||
// testAllocZombie ensures that a restart of a dead allocation does not cause
|
||||
// it to come back to life in a not-quite alive state.
|
||||
//
|
||||
// https://github.com/hashicorp/nomad/issues/17079
|
||||
func testAllocZombie(t *testing.T) {
|
||||
nomad := e2eutil.NomadClient(t)
|
||||
|
||||
jobID := "alloc-zombie-" + uuid.Short()
|
||||
jobIDs := []string{jobID}
|
||||
t.Cleanup(e2eutil.CleanupJobsAndGC(t, &jobIDs))
|
||||
|
||||
// start the job and wait for alloc to become failed
|
||||
err := e2eutil.Register(jobID, "./input/alloc_zombie.nomad")
|
||||
must.NoError(t, err)
|
||||
|
||||
allocID := e2eutil.SingleAllocID(t, jobID, "", 0)
|
||||
|
||||
// wait for alloc to be marked as failed
|
||||
e2eutil.WaitForAllocStatus(t, nomad, allocID, "failed")
|
||||
|
||||
// wait for additional failures to know we got rescheduled
|
||||
must.Wait(t, wait.InitialSuccess(
|
||||
wait.BoolFunc(func() bool {
|
||||
statuses, err := e2eutil.AllocStatusesRescheduled(jobID, "")
|
||||
must.NoError(t, err)
|
||||
return len(statuses) > 2
|
||||
}),
|
||||
wait.Timeout(1*time.Minute),
|
||||
wait.Gap(1*time.Second),
|
||||
))
|
||||
|
||||
// now attempt to restart our initial allocation
|
||||
// which should do nothing but give us an error
|
||||
output, err := e2eutil.Command("nomad", "alloc", "restart", allocID)
|
||||
must.ErrorContains(t, err, "restart of an alloc that should not run")
|
||||
must.StrContains(t, output, "Failed to restart allocation")
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
# Copyright (c) HashiCorp, Inc.
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
# Job fixture for the alloc-zombie e2e test: the service check polls a path
# that never exists, so check_restart fails the task quickly and the group's
# restart/reschedule settings drive the allocation into a terminal state.
job "alloc_zombie" {

  group "group" {
    network {
      mode = "host"
      port "http" {}
    }

    # Nomad-provider service whose check always fails (the path is never
    # served), triggering check_restart after the grace period.
    service {
      name     = "alloczombie"
      port     = "http"
      provider = "nomad"

      check {
        name     = "alloczombiecheck"
        type     = "http"
        port     = "http"
        path     = "/does/not/exist.txt"
        interval = "2s"
        timeout  = "1s"
        check_restart {
          # one failed check is enough to restart (and, with restart
          # attempts = 0 below, fail) the task
          limit = 1
          grace = "3s"
        }
      }
    }

    # bounded rescheduling so the test sees replacement allocations but the
    # job eventually stops producing new ones
    reschedule {
      attempts       = 3
      interval       = "1m"
      delay          = "5s"
      delay_function = "constant"
      unlimited      = false
    }

    # no local restarts: the first check_restart immediately fails the alloc
    restart {
      attempts = 0
      delay    = "5s"
      mode     = "fail"
    }

    # minimal HTTP server; it serves /tmp, which does not contain the
    # checked path, so the health check can never pass
    task "python" {
      driver = "raw_exec"

      config {
        command = "python3"
        args    = ["-m", "http.server", "${NOMAD_PORT_http}", "--directory", "/tmp"]
      }

      resources {
        cpu    = 10
        memory = 64
      }
    }
  }
}
|
Loading…
Reference in New Issue