open-nomad/client/allocrunner/alloc_runner_unix_test.go

// +build !windows

package allocrunner

import (
	"encoding/json"
	"fmt"
	"os"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// TestAllocRunner_Restore_RunningTerminal asserts that restoring a terminal
// alloc with a running task properly kills the running task. This is meant
// to simulate a Nomad agent crash after receiving an updated alloc with
// DesiredStatus=Stop, persisting the update, but crashing before terminating
// the task.
//
// The test reads logmon's PID out of the persisted "logmon" hook state and
// probes that process with signal 0: once after the first alloc runner has
// shut down (the probe must succeed) and once after the restored alloc runner
// has finished (the probe must fail).
func TestAllocRunner_Restore_RunningTerminal(t *testing.T) {
	t.Parallel()

	// 1. Run task
	// 2. Shutdown alloc runner
	// 3. Set alloc.DesiredStatus=stop
	// 4. Start new alloc runner
	// 5. Assert task and logmon are cleaned up

	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1h",
	}

	// The first runner gets a copy so the original alloc can be mutated
	// later without touching what the first runner holds.
	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for task to be running
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		return s.ClientStatus == structs.AllocClientStatusRunning, fmt.Errorf("expected running, got %s", s.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Shutdown the AR and manually change the state to mimic a crash where
	// a stopped alloc update is received, but Nomad crashes before
	// stopping the alloc.
	ar.Shutdown()
	select {
	case <-ar.ShutdownCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "AR took too long to exit")
	}

	// Assert logmon is still running. This is a super ugly hack that pulls
	// logmon's PID out of its reattach config, but it does properly ensure
	// logmon gets cleaned up.
	ls, _, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)

	// Fail with a clear message instead of a nil dereference or an opaque
	// JSON error if the logmon hook never persisted any state.
	logmonHook := ls.Hooks["logmon"]
	require.NotNil(t, logmonHook, "logmon hook state was not persisted")

	// Only the PID is needed from the reattach config; other fields in the
	// JSON document are ignored by the decoder.
	logmonReattach := struct {
		Pid int
	}{}
	err = json.Unmarshal([]byte(logmonHook.Data["reattach_config"]), &logmonReattach)
	require.NoError(t, err)

	logmonProc, err := os.FindProcess(logmonReattach.Pid)
	require.NoError(t, err)

	// Signal 0 delivers nothing; it only reports whether the process can
	// be signalled, i.e. whether it still exists.
	require.NoError(t, logmonProc.Signal(syscall.Signal(0)))

	// Fake alloc terminal during Restore()
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.ModifyIndex++
	alloc.AllocModifyIndex++

	// Start a new alloc runner and assert it gets stopped
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and wait for task to be killed
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	select {
	case <-ar2.WaitCh():
	case <-time.After(30 * time.Second):
		// Fail loudly: silently falling through would hide a hung
		// restored alloc runner behind the logmon assertion below.
		require.Fail(t, "restored AR took too long to exit")
	}

	// Assert logmon was cleaned up
	require.Error(t, logmonProc.Signal(syscall.Signal(0)))
}
client: test logmon cleanup. The test is sadly quite complicated and peeks into things (logmon's reattach config) that AR doesn't normally have access to. However, I couldn't find another way of asserting logmon got cleaned up without resorting to smaller unit tests. Smaller unit tests risk re-implementing dependencies in an unrealistic way, so I opted for an ugly integration test. 2019-03-01 23:02:53 +00:00			`// +build !windows`

			`package allocrunner`

			`import (`
			`"encoding/json"`
			`"fmt"`
			`"os"`
			`"syscall"`
			`"testing"`
			`"time"`

			`"github.com/hashicorp/nomad/client/state"`
			`"github.com/hashicorp/nomad/nomad/mock"`
			`"github.com/hashicorp/nomad/nomad/structs"`
			`"github.com/hashicorp/nomad/testutil"`
			`"github.com/stretchr/testify/require"`
			`)`

			`// TestAllocRunner_Restore_RunningTerminal asserts that restoring a terminal`
			`// alloc with a running task properly kills the running task. This is meant`
			`// to simulate a Nomad agent crash after receiving an updated alloc with`
			`// DesiredStatus=Stop, persisting the update, but crashing before terminating`
			`// the task.`
			`func TestAllocRunner_Restore_RunningTerminal(t *testing.T) {`
			`t.Parallel()`

			`// 1. Run task`
			`// 2. Shutdown alloc runner`
			`// 3. Set alloc.desiredstatus=stop`
			`// 4. Start new alloc runner`
			`// 5. Assert task and logmon are cleaned up`

			`alloc := mock.Alloc()`
			`task := alloc.Job.TaskGroups[0].Tasks[0]`
			`task.Driver = "mock_driver"`
			`task.Config = map[string]interface{}{`
			`"run_for": "1h",`
			`}`

			`conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())`
			`defer cleanup()`

			`// Maintain state for subsequent run`
			`conf.StateDB = state.NewMemDB(conf.Logger)`

			`// Start and wait for task to be running`
			`ar, err := NewAllocRunner(conf)`
			`require.NoError(t, err)`
			`go ar.Run()`
			`defer destroy(ar)`

			`testutil.WaitForResult(func() (bool, error) {`
			`s := ar.AllocState()`
			`return s.ClientStatus == structs.AllocClientStatusRunning, fmt.Errorf("expected running, got %s", s.ClientStatus)`
			`}, func(err error) {`
			`require.NoError(t, err)`
			`})`

			`// Shutdown the AR and manually change the state to mimic a crash where`
			`// a stopped alloc update is received, but Nomad crashes before`
			`// stopping the alloc.`
			`ar.Shutdown()`
			`select {`
			`case <-ar.ShutdownCh():`
			`case <-time.After(30 * time.Second):`
			`require.Fail(t, "AR took too long to exit")`
			`}`

			`// Assert logmon is still running. This is a super ugly hack that pulls`
			`// logmon's PID out of its reattach config, but it does properly ensure`
			`// logmon gets cleaned up.`
			`ls, _, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)`
			`require.NoError(t, err)`
			`require.NotNil(t, ls)`

			`logmonReattach := struct {`
			`Pid int`
			`}{}`
			`err = json.Unmarshal([]byte(ls.Hooks["logmon"].Data["reattach_config"]), &logmonReattach)`
			`require.NoError(t, err)`

			`logmonProc, _ := os.FindProcess(logmonReattach.Pid)`
			`require.NoError(t, logmonProc.Signal(syscall.Signal(0)))`

			`// Fake alloc terminal during Restore()`
			`alloc.DesiredStatus = structs.AllocDesiredStatusStop`
			`alloc.ModifyIndex++`
			`alloc.AllocModifyIndex++`

			`// Start a new alloc runner and assert it gets stopped`
			`conf2, cleanup2 := testAllocRunnerConfig(t, alloc)`
			`defer cleanup2()`

			`// Use original statedb to maintain hook state`
			`conf2.StateDB = conf.StateDB`

			`// Restore, start, and wait for task to be killed`
			`ar2, err := NewAllocRunner(conf2)`
			`require.NoError(t, err)`

			`require.NoError(t, ar2.Restore())`

			`go ar2.Run()`
			`defer destroy(ar2)`

			`select {`
			`case <-ar2.WaitCh():`
			`case <-time.After(30 * time.Second):`
			`}`

			`// Assert logmon was cleaned up`
			`require.Error(t, logmonProc.Signal(syscall.Signal(0)))`
			`}`