open-nomad/client/allocrunner/alloc_runner_test.go

package allocrunner

import (
	"fmt"
	"testing"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/client/allochealth"
	cconsul "github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// TestAllocRunner_AllocState_Initialized asserts that getting TaskStates via
// AllocState() are initialized even before the AllocRunner has run.
func TestAllocRunner_AllocState_Initialized(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	allocState := ar.AllocState()

	require.NotNil(t, allocState)
	require.NotNil(t, allocState.TaskStates[conf.Alloc.Job.TaskGroups[0].Tasks[0].Name])
}

// TestAllocRunner_TaskLeader_KillTG asserts that when a leader task dies the
// entire task group is killed.
func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
	t.Parallel()

	alloc := mock.BatchAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create two tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "task1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task2"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "1s",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer ar.Destroy()
	go ar.Run()

	// Wait for all tasks to be killed
	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Task1 should be killed because Task2 exited
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type != structs.TaskLeaderDead {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
		}

		// Task Two should be dead
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopTG asserts that when stopping an alloc with a
// leader the leader is stopped before other tasks.
func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create 3 tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task3 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task3.Name = "follower2"
	task3.Driver = "mock_driver"
	task3.Config = map[string]interface{}{
		"run_for": "10s",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2, task3)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr
	alloc.AllocatedResources.Tasks[task3.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer ar.Destroy()
	go ar.Run()

	// Wait for tasks to start
	upd := conf.StateUpdater.(*MockStateUpdater)
	last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 3 {
			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Reset updates
	upd.Reset()

	// Stop alloc
	update := alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
		}
		return true, nil
	}, func(err error) {
		last := upd.Last()
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
// restored task group with a leader that failed before restoring the leader is
// not stopped as it does not exist.
// See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0

	// Create a leader and follower task in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Second
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Use a memory backed statedb
	conf.StateDB = state.NewMemDB()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	// Mimic Nomad exiting before the leader stopping is able to stop other tasks.
	ar.tasks["leader"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))
	ar.tasks["follower1"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))

	// Create a new AllocRunner to test RestoreState and Run
	ar2, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer ar2.Destroy()

	if err := ar2.Restore(); err != nil {
		t.Fatalf("error restoring state: %v", err)
	}
	ar2.Run()

	// Wait for tasks to be stopped because leader is dead
	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		for task, state := range alloc.TaskStates {
			if state.State != structs.TaskStateDead {
				return false, fmt.Errorf("Task %q should be dead: %v", task, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Make sure it GCs properly
	ar2.Destroy()

	select {
	case <-ar2.DestroyCh():
		// exited as expected
	case <-time.After(10 * time.Second):
		t.Fatalf("timed out waiting for AR to GC")
	}
}

func TestAllocRunner_Update_Semantics(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	updatedAlloc := func(a *structs.Allocation) *structs.Allocation {
		upd := a.CopySkipJob()
		upd.AllocModifyIndex++

		return upd
	}

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(err)

	upd1 := updatedAlloc(alloc)
	ar.Update(upd1)

	// Update was placed into a queue
	require.Len(ar.allocUpdatedCh, 1)

	upd2 := updatedAlloc(alloc)
	ar.Update(upd2)

	// Allocation was _replaced_

	require.Len(ar.allocUpdatedCh, 1)
	queuedAlloc := <-ar.allocUpdatedCh
	require.Equal(upd2, queuedAlloc)

	// Requeueing older alloc is skipped
	ar.Update(upd2)
	ar.Update(upd1)

	queuedAlloc = <-ar.allocUpdatedCh
	require.Equal(upd2, queuedAlloc)

	// Ignore after watch closed

	close(ar.waitCh)

	ar.Update(upd1)

	// Did not queue the update
	require.Len(ar.allocUpdatedCh, 0)
}

// TestAllocRunner_DeploymentHealth_Healthy_Migration asserts that health is
// reported for services that got migrated; not just part of deployments.
func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()

	// Ensure the alloc is *not* part of a deployment
	alloc.DeploymentID = ""

	// Shorten the default migration healthy time
	tg := alloc.Job.TaskGroups[0]
	tg.Migrate = structs.DefaultMigrateStrategy()
	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates

	task := tg.Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "30s",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer ar.Destroy()

	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}

// TestAllocRunner_DeploymentHealth_Healthy_NoChecks asserts that the health
// watcher will mark the allocation as healthy based on task states alone.
func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()

	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	// Create a task that takes longer to become healthy
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task.Copy())
	alloc.AllocatedResources.Tasks["task2"] = alloc.AllocatedResources.Tasks["web"].Copy()
	task2 := alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task2"
	task2.Config["start_block_for"] = "500ms"

	// Make the alloc be part of a deployment that uses task states for
	// health checks
	alloc.DeploymentID = uuid.Generate()
	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	start, done := time.Now(), time.Time{}
	go ar.Run()
	defer ar.Destroy()

	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if !last.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status healthy; got unhealthy")
		}

		// Capture the done timestamp
		done = last.DeploymentStatus.Timestamp
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	if d := done.Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// TestAllocRunner_DeploymentHealth_Unhealthy_Checks asserts that the health
// watcher will mark the allocation as unhealthy with failing checks.
func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
	t.Skip("FIXME(schmichael): fails and needs fixing")
	t.Parallel()

	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	// Make the alloc be part of a deployment
	alloc.DeploymentID = uuid.Generate()
	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second

	checkUnhealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthWarning,
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Only return the check as healthy after a duration
	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
	consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		return &consul.AllocRegistration{
			Tasks: map[string]*consul.TaskRegistration{
				task.Name: {
					Services: map[string]*consul.ServiceRegistration{
						"123": {
							Service: &api.AgentService{Service: "foo"},
							Checks:  []*api.AgentCheck{checkUnhealthy},
						},
					},
				},
			},
		}, nil
	}

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer ar.Destroy()

	var lastUpdate *structs.Allocation
	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		lastUpdate = upd.Last()
		if lastUpdate == nil {
			return false, fmt.Errorf("No updates")
		}
		if !lastUpdate.DeploymentStatus.HasHealth() {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *lastUpdate.DeploymentStatus.Healthy {
			// This is fatal
			t.Fatal("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	require.Len(t, lastUpdate.TaskStates, 1)
	state := lastUpdate.TaskStates[task.Name]
	require.NotNil(t, state)
	require.NotEmpty(t, state.Events)
	last := state.Events[len(state.Events)-1]
	require.Equal(t, allochealth.AllocHealthEventSource, last.Type)
	require.Contains(t, last.Message, "Services not healthy by deadline")
}