open-nomad/e2e/servicediscovery/nomad_checks_test.go
Seth Hoenig 7375fd40fc
nsd: block on removal of services (#15862)
* nsd: block on removal of services

This PR uses a WaitGroup to ensure workload removals are complete
before returning from ServiceRegistrationHandler.RemoveWorkload of
the nomad service provider. The de-registration of individual services
still occurs asynchronously, but we must block on the parent removal
call so that we do not race with further operations on the same set
of services - e.g. in the case of a task restart where we de-register
and then re-register the services in quick succession.

Fixes #15032

* nsd: add e2e test for initial failing check and restart
2023-01-26 08:17:57 -06:00

171 lines
5.2 KiB
Go

package servicediscovery
import (
"context"
"regexp"
"strings"
"testing"
"time"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
"github.com/stretchr/testify/require"
)
// testChecksHappy registers the happy-path checks job, waits for its single
// allocation to run, and then polls 'nomad alloc checks' until the output
// shows a successful check with a 200 status code.
func testChecksHappy(t *testing.T) {
	client := e2eutil.NomadClient(t)

	// Each run gets its own job ID so concurrent or repeated runs do not
	// collide.
	jobID := "nsd-check-happy-" + uuid.Short()
	jobIDs := []string{jobID}

	// Schedule job cleanup for when the test function returns; the context
	// handed to the cleanup helper is cancelled afterwards.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	defer e2eutil.CleanupJobsAndGCWithContext(t, ctx, &jobIDs)

	// Submit the job; exactly one allocation is expected.
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, jobChecksHappy, jobID, "")
	must.Len(t, 1, allocs)
	allocID := allocs[0].ID

	e2eutil.WaitForAllocRunning(t, client, allocID)

	// Patterns the check output has to match, compiled once ahead of the
	// polling loop.
	var (
		successStatus = regexp.MustCompile(`Status\s+=\s+success`)
		okStatusCode  = regexp.MustCompile(`StatusCode\s+=\s+200`)
	)

	// checksPassing reports whether the current 'nomad alloc checks' output
	// shows a success status, a 200 status code, and nomad's success string.
	checksPassing := func() bool {
		output, err := e2eutil.AllocChecks(allocID)
		return err == nil &&
			successStatus.MatchString(output) &&
			okStatusCode.MatchString(output) &&
			strings.Contains(output, `nomad: http ok`)
	}

	require.Eventually(t, checksPassing, 5*time.Second, 200*time.Millisecond)
}
// testChecksSad registers the failing checks job, waits for its single
// allocation to run, and then polls 'nomad alloc checks' until the output
// shows a failed check with a 501 status code and the expected error body.
func testChecksSad(t *testing.T) {
	client := e2eutil.NomadClient(t)

	// Each run gets its own job ID so concurrent or repeated runs do not
	// collide.
	jobID := "nsd-check-sad-" + uuid.Short()
	jobIDs := []string{jobID}

	// Schedule job cleanup for when the test function returns; the context
	// handed to the cleanup helper is cancelled afterwards.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	defer e2eutil.CleanupJobsAndGCWithContext(t, ctx, &jobIDs)

	// Submit the job; exactly one allocation is expected.
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, jobChecksSad, jobID, "")
	must.Len(t, 1, allocs)
	allocID := allocs[0].ID

	e2eutil.WaitForAllocRunning(t, client, allocID)

	// Patterns the check output has to match, compiled once ahead of the
	// polling loop.
	var (
		failureStatus     = regexp.MustCompile(`Status\s+=\s+failure`)
		notImplStatusCode = regexp.MustCompile(`StatusCode\s+=\s+501`)
	)

	// Error text expected in the check output (produced by python's
	// http.server, per the original test's note).
	const notImplementedBody = `<p>Error code explanation: HTTPStatus.NOT_IMPLEMENTED - Server does not support this operation.</p>`

	// checksFailing reports whether the current 'nomad alloc checks' output
	// shows a failure status, a 501 status code, and the expected error body.
	checksFailing := func() bool {
		output, err := e2eutil.AllocChecks(allocID)
		return err == nil &&
			failureStatus.MatchString(output) &&
			notImplStatusCode.MatchString(output) &&
			strings.Contains(output, notImplementedBody)
	}

	require.Eventually(t, checksFailing, 5*time.Second, 200*time.Millisecond)
}
// testChecksServiceReRegisterAfterCheckRestart verifies that a service is
// registered again after its task was restarted because of a failing health
// check.
//
// Flow:
//  1. register the main job and wait until a "Restarting" task event shows up;
//  2. run the helper job with the "touch" command so the check starts passing;
//  3. wait for the main allocation to be running and assert exactly one
//     service instance is registered.
//
// On exit the helper job is run once more with the "rm" command to undo what
// "touch" created.
func testChecksServiceReRegisterAfterCheckRestart(t *testing.T) {
	const (
		jobChecksAfterRestartMain   = "./input/checks_task_restart_main.nomad"
		jobChecksAfterRestartHelper = "./input/checks_task_restart_helper.nomad"
	)

	nomadClient := e2eutil.NomadClient(t)

	idJobMain := "nsd-check-restart-services-" + uuid.Short()
	idJobHelper := "nsd-check-restart-services-helper-" + uuid.Short()
	jobIDs := []string{idJobMain, idJobHelper}

	// Schedule cleanup of both jobs for when the test function returns; the
	// context handed to the cleanup helper is cancelled afterwards.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	defer e2eutil.CleanupJobsAndGCWithContext(t, ctx, &jobIDs)

	// register the main job
	allocStubs := e2eutil.RegisterAndWaitForAllocs(t, nomadClient, jobChecksAfterRestartMain, idJobMain, "")
	must.Len(t, 1, allocStubs)

	// wait for task restart due to failing health check
	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(func() bool {
			allocEvents, err := e2eutil.AllocTaskEventsForJob(idJobMain, "")
			if err != nil {
				t.Log("failed to get task events for job", idJobMain, err)
				return false
			}
			for _, events := range allocEvents {
				for _, event := range events {
					if event["Type"] == "Restarting" {
						return true
					}
				}
			}
			return false
		}),
		wait.Timeout(30*time.Second),
		wait.Gap(3*time.Second),
	))

	// runHelper registers the helper job on the main allocation's node,
	// passing command through as the job's "cmd" variable.
	//
	// The command must be forwarded rather than hard-coded: previously "touch"
	// was always sent, so the deferred runHelper("rm") below re-ran "touch"
	// instead of cleaning up.
	runHelper := func(command string) {
		vars := []string{
			"-var", "nodeID=" + allocStubs[0].NodeID,
			"-var", "cmd=" + command,
			"-var", "delay=3s",
		}
		err := e2eutil.RegisterWithArgs(idJobHelper, jobChecksAfterRestartHelper, vars...)
		test.NoError(t, err)
	}

	// register helper job, triggering check to start passing
	runHelper("touch")
	// Runs before the job cleanup above (defers are LIFO), so the helper job
	// can still be registered to undo the "touch".
	defer runHelper("rm")

	// wait for main task to become healthy
	e2eutil.WaitForAllocStatus(t, nomadClient, allocStubs[0].ID, structs.AllocClientStatusRunning)

	// finally assert we have services
	services := nomadClient.Services()
	serviceStubs, _, err := services.Get("nsd-checks-task-restart-test", nil)
	must.NoError(t, err)
	must.Len(t, 1, serviceStubs)
}