open-nomad/e2e/clientstate/allocs_test.go
Seth Hoenig e04ff0d935
client: ignore restart issued to terminal allocations (#17175)
* client: ignore restart issued to terminal allocations

This PR fixes a bug where issuing a restart to a terminal allocation
would cause the allocation to run its hooks anyway. This was particularly
apparent with group_service_hook, which would then register services but
never deregister them - as the allocation would be effectively in
a "zombie" state where it is prepped to run tasks but never will.

* e2e: add e2e test for alloc restart zombies

* cl: tweak text

Co-authored-by: Tim Gross <tgross@hashicorp.com>

---------

Co-authored-by: Tim Gross <tgross@hashicorp.com>
2023-05-16 10:19:41 -05:00

62 lines
1.7 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package clientstate
import (
"testing"
"time"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
)
// TestClientAllocs is the entry point for the client allocation e2e tests.
// It obtains a Nomad API client, calls the e2eutil helpers that wait for a
// leader and for one ready node, and then runs each scenario as a subtest.
func TestClientAllocs(t *testing.T) {
	client := e2eutil.NomadClient(t)

	// cluster preconditions shared by every subtest below
	e2eutil.WaitForLeader(t, client)
	e2eutil.WaitForNodesReady(t, client, 1)

	t.Run("testAllocZombie", testAllocZombie)
}
// testAllocZombie verifies that asking the client to restart an allocation
// that has already failed is rejected with an error, instead of bringing the
// allocation back in a half-alive "zombie" state.
//
// https://github.com/hashicorp/nomad/issues/17079
func testAllocZombie(t *testing.T) {
	client := e2eutil.NomadClient(t)

	id := "alloc-zombie-" + uuid.Short()
	cleanup := []string{id}
	t.Cleanup(e2eutil.CleanupJobsAndGC(t, &cleanup))

	// register the job and grab the ID of its one allocation
	must.NoError(t, e2eutil.Register(id, "./input/alloc_zombie.nomad"))
	original := e2eutil.SingleAllocID(t, id, "", 0)

	// block until that first allocation is reported as failed
	e2eutil.WaitForAllocStatus(t, client, original, "failed")

	// more than two rescheduled statuses tells us follow-up failures happened,
	// i.e. the job really was rescheduled after the initial failure
	rescheduled := func() bool {
		statuses, err := e2eutil.AllocStatusesRescheduled(id, "")
		must.NoError(t, err)
		return len(statuses) > 2
	}
	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(rescheduled),
		wait.Timeout(time.Minute),
		wait.Gap(time.Second),
	))

	// restarting the original (now terminal) allocation must be refused:
	// the command is expected to error out and report the failure
	out, restartErr := e2eutil.Command("nomad", "alloc", "restart", original)
	must.ErrorContains(t, restartErr, "restart of an alloc that should not run")
	must.StrContains(t, out, "Failed to restart allocation")
}