open-nomad/client/alloc_endpoint_test.go

package client

import (
	"encoding/json"
	"fmt"
	"io"
	"net"
	"runtime"
	"strings"
	"testing"
	"time"

	"github.com/hashicorp/go-msgpack/codec"
	"github.com/hashicorp/nomad/acl"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/client/config"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/helper/pluginutils/catalog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad"
	"github.com/hashicorp/nomad/nomad/mock"
	nstructs "github.com/hashicorp/nomad/nomad/structs"
	nconfig "github.com/hashicorp/nomad/nomad/structs/config"
	"github.com/hashicorp/nomad/plugins/drivers"
	"github.com/hashicorp/nomad/testutil"
	"github.com/shoenig/test/must"
	"github.com/stretchr/testify/require"
	"golang.org/x/sys/unix"
)

// TestAllocations_Restart checks the Allocations.Restart client RPC against a
// locally added allocation: a request that names no allocation must be
// rejected, while a request naming the allocation is retried for as long as
// the returned error mentions "not running".
func TestAllocations_Restart(t *testing.T) {
	ci.Parallel(t)

	c, shutdown := TestClient(t, nil)
	defer shutdown()

	// Run the task on the mock driver for 10s under a zero-attempt,
	// fail-mode restart policy.
	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	alloc.Job.TaskGroups[0].RestartPolicy = &nstructs.RestartPolicy{
		Attempts: 0,
		Mode:     nstructs.RestartPolicyModeFail,
	}
	alloc.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "10s",
	}
	require.Nil(t, c.addAlloc(alloc, ""))

	// An empty request does not identify an allocation and must fail.
	args := &nstructs.AllocRestartRequest{}
	var reply nstructs.GenericResponse
	rpcErr := c.ClientRPC("Allocations.Restart", &args, &reply)
	require.Error(t, rpcErr)

	// Re-use the request, now pointed at the allocation added above.
	args.AllocID = alloc.ID

	attempt := func() (bool, error) {
		var out nstructs.GenericResponse
		callErr := c.ClientRPC("Allocations.Restart", &args, &out)
		if callErr == nil || !strings.Contains(callErr.Error(), "not running") {
			// Success, or any error other than "not running", ends the wait.
			return true, nil
		}
		return false, callErr
	}
	onTimeout := func(err error) {
		t.Fatalf("err: %v", err)
	}
	testutil.WaitForResult(attempt, onTimeout)
}

// TestAllocations_RestartAllTasks checks the AllTasks flag of the
// Allocations.Restart client RPC: combining it with a task name must be
// rejected, and a request with AllTasks alone is retried for as long as the
// returned error mentions "not running".
func TestAllocations_RestartAllTasks(t *testing.T) {
	ci.Parallel(t)

	c, shutdown := TestClient(t, nil)
	defer shutdown()

	lifecycle := mock.LifecycleAlloc()
	require.Nil(t, c.addAlloc(lifecycle, ""))

	// AllTasks and TaskName together form an invalid request.
	args := &nstructs.AllocRestartRequest{
		AllocID:  lifecycle.ID,
		AllTasks: true,
		TaskName: "web",
	}
	var reply nstructs.GenericResponse
	rpcErr := c.ClientRPC("Allocations.Restart", &args, &reply)
	require.Error(t, rpcErr)

	// Valid request: every task, no task name.
	args = &nstructs.AllocRestartRequest{
		AllocID:  lifecycle.ID,
		AllTasks: true,
	}

	attempt := func() (bool, error) {
		var out nstructs.GenericResponse
		callErr := c.ClientRPC("Allocations.Restart", &args, &out)
		if callErr == nil || !strings.Contains(callErr.Error(), "not running") {
			// Success, or any error other than "not running", ends the wait.
			return true, nil
		}
		return false, callErr
	}
	onTimeout := func(err error) {
		t.Fatalf("err: %v", err)
	}
	testutil.WaitForResult(attempt, onTimeout)
}

// TestAllocations_Restart_ACL runs Allocations.Restart against an ACL-enabled
// server and client for a running batch allocation. It covers a request with
// no token, a token whose namespace policy lists no capabilities, a token
// holding acl.NamespaceCapabilityAllocLifecycle, and the management token.
func TestAllocations_Restart_ACL(t *testing.T) {
	ci.Parallel(t)

	srv, srvAddr, mgmt, stopServer := testACLServer(t, nil)
	defer stopServer()

	c, stopClient := TestClient(t, func(cfg *config.Config) {
		cfg.Servers = []string{srvAddr}
		cfg.ACLEnabled = true
	})
	defer stopClient()

	batch := mock.BatchJob()
	batch.TaskGroups[0].Count = 1
	batch.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
	}

	// Block until the job is running and keep its first allocation.
	running := testutil.WaitForRunningWithToken(t, srv.RPC, batch, mgmt.SecretID)[0]

	// No token: permission denied.
	{
		args := &nstructs.AllocRestartRequest{}
		args.AllocID = running.ID
		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.Restart", &args, &reply)
		require.NotNil(t, rpcErr)
		require.ErrorContains(t, rpcErr, nstructs.ErrPermissionDenied.Error())
	}

	// Token whose policy grants no capabilities: permission denied.
	{
		emptyPolicy := mock.NamespacePolicy(nstructs.DefaultNamespace, "", []string{})
		denied := mock.CreatePolicyAndToken(t, srv.State(), 1005, "invalid", emptyPolicy)

		args := &nstructs.AllocRestartRequest{}
		args.AllocID = running.ID
		args.AuthToken = denied.SecretID

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.Restart", &args, &reply)

		require.NotNil(t, rpcErr)
		require.EqualError(t, rpcErr, nstructs.ErrPermissionDenied.Error())
	}

	// Token with the alloc-lifecycle capability: the restart must succeed.
	{
		lifecyclePolicy := mock.NamespacePolicy(nstructs.DefaultNamespace, "", []string{acl.NamespaceCapabilityAllocLifecycle})
		allowed := mock.CreatePolicyAndToken(t, srv.State(), 1007, "valid", lifecyclePolicy)
		require.NotNil(t, allowed)

		args := &nstructs.AllocRestartRequest{}
		args.AllocID = running.ID
		args.AuthToken = allowed.SecretID
		args.Namespace = nstructs.DefaultNamespace

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.Restart", &args, &reply)
		require.NoError(t, rpcErr)
	}

	// Management token.
	{
		args := &nstructs.AllocRestartRequest{}
		args.AllocID = running.ID
		args.AuthToken = mgmt.SecretID

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.Restart", &args, &reply)
		// The previous case already triggered a restart, so depending on
		// timing this call either succeeds or reports that the task is not
		// running. Both outcomes are accepted.
		if rpcErr != nil {
			require.Contains(t, rpcErr.Error(), "Task not running", rpcErr)
		}
	}
}

// TestAllocations_GarbageCollectAll verifies that the
// Allocations.GarbageCollectAll client RPC returns no error on a test client
// created with the default configuration.
func TestAllocations_GarbageCollectAll(t *testing.T) {
	ci.Parallel(t)

	c, shutdown := TestClient(t, nil)
	defer shutdown()

	args := &nstructs.NodeSpecificRequest{}
	var reply nstructs.GenericResponse
	rpcErr := c.ClientRPC("Allocations.GarbageCollectAll", &args, &reply)
	require.Nil(t, rpcErr)
}

// TestAllocations_GarbageCollectAll_ACL runs Allocations.GarbageCollectAll
// against an ACL-enabled server and client. Requests with no token or with a
// node-deny token must be rejected with permission denied; requests with a
// node-write token or the management token must succeed.
func TestAllocations_GarbageCollectAll_ACL(t *testing.T) {
	ci.Parallel(t)

	srv, srvAddr, mgmt, stopServer := testACLServer(t, nil)
	defer stopServer()

	c, stopClient := TestClient(t, func(cfg *config.Config) {
		cfg.Servers = []string{srvAddr}
		cfg.ACLEnabled = true
	})
	defer stopClient()

	// No token: permission denied.
	{
		args := &nstructs.NodeSpecificRequest{}
		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.GarbageCollectAll", &args, &reply)
		require.NotNil(t, rpcErr)
		require.EqualError(t, rpcErr, nstructs.ErrPermissionDenied.Error())
	}

	// Node policy set to deny: permission denied.
	{
		denied := mock.CreatePolicyAndToken(t, srv.State(), 1005, "invalid", mock.NodePolicy(acl.PolicyDeny))
		args := &nstructs.NodeSpecificRequest{}
		args.AuthToken = denied.SecretID

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.GarbageCollectAll", &args, &reply)

		require.NotNil(t, rpcErr)
		require.EqualError(t, rpcErr, nstructs.ErrPermissionDenied.Error())
	}

	// Node policy set to write: allowed.
	{
		allowed := mock.CreatePolicyAndToken(t, srv.State(), 1007, "valid", mock.NodePolicy(acl.PolicyWrite))
		args := &nstructs.NodeSpecificRequest{}
		args.AuthToken = allowed.SecretID

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.GarbageCollectAll", &args, &reply)
		require.Nil(t, rpcErr)
	}

	// Management token: allowed.
	{
		args := &nstructs.NodeSpecificRequest{}
		args.AuthToken = mgmt.SecretID

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.GarbageCollectAll", &args, &reply)
		require.Nil(t, rpcErr)
	}
}

// TestAllocations_GarbageCollect checks the Allocations.GarbageCollect client
// RPC: a request that names no allocation must fail, and a request for a
// short-lived allocation is retried until it either succeeds or the client no
// longer tracks a live runner for that allocation.
func TestAllocations_GarbageCollect(t *testing.T) {
	ci.Parallel(t)

	c, shutdown := TestClient(t, func(cfg *config.Config) {
		cfg.GCDiskUsageThreshold = 100.0
	})
	defer shutdown()

	// The same zero-attempt, fail-mode policy is shared by the group and the
	// task; the mock driver task runs for 10ms.
	noRestart := &nstructs.RestartPolicy{
		Attempts: 0,
		Mode:     nstructs.RestartPolicyModeFail,
	}
	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	alloc.Job.TaskGroups[0].RestartPolicy = noRestart
	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy = noRestart
	alloc.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "10ms",
	}
	require.Nil(t, c.addAlloc(alloc, ""))

	// An empty request does not identify an allocation and must fail.
	args := &nstructs.AllocSpecificRequest{}
	var reply nstructs.GenericResponse
	rpcErr := c.ClientRPC("Allocations.GarbageCollect", &args, &reply)
	require.NotNil(t, rpcErr)

	// Re-use the request, now pointed at the allocation added above.
	args.AllocID = alloc.ID

	attempt := func() (bool, error) {
		// Done if the runner is already gone or destroyed.
		runner, tracked := c.allocs[alloc.ID]
		if !tracked || runner.IsDestroyed() {
			return true, nil
		}

		var out nstructs.GenericResponse
		callErr := c.ClientRPC("Allocations.GarbageCollect", &args, &out)
		return callErr == nil, callErr
	}
	onTimeout := func(err error) {
		t.Fatalf("err: %v", err)
	}
	testutil.WaitForResult(attempt, onTimeout)
}

// TestAllocations_GarbageCollect_ACL runs Allocations.GarbageCollect against an
// ACL-enabled server and client for a running batch allocation. Requests with
// no token or with a node-deny token must be rejected with permission denied.
// Requests carrying a submit-job token or the management token pass the ACL
// check but are still expected to return an error for this allocation.
func TestAllocations_GarbageCollect_ACL(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	server, addr, root, cleanupS := testACLServer(t, nil)
	defer cleanupS()

	client, cleanupC := TestClient(t, func(c *config.Config) {
		c.Servers = []string{addr}
		c.ACLEnabled = true
	})
	defer cleanupC()

	job := mock.BatchJob()
	job.TaskGroups[0].Count = 1
	job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
	}

	// NOTE(review): this value is handed to require.Error below, where it is
	// only used as the failure message; the text of the returned error is not
	// compared against it.
	noSuchAllocErr := fmt.Errorf("No such allocation on client or allocation not eligible for GC")

	// Wait for client to be running job
	alloc := testutil.WaitForRunningWithToken(t, server.RPC, job, root.SecretID)[0]

	// Try request without a token and expect failure
	{
		req := &nstructs.AllocSpecificRequest{}
		req.AllocID = alloc.ID
		var resp nstructs.GenericResponse
		err := client.ClientRPC("Allocations.GarbageCollect", &req, &resp)
		require.NotNil(err)
		require.ErrorContains(err, nstructs.ErrPermissionDenied.Error())
	}

	// Try request with an invalid token and expect failure
	{
		token := mock.CreatePolicyAndToken(t, server.State(), 1005, "invalid", mock.NodePolicy(acl.PolicyDeny))
		req := &nstructs.AllocSpecificRequest{}
		req.AllocID = alloc.ID
		req.AuthToken = token.SecretID

		var resp nstructs.GenericResponse
		err := client.ClientRPC("Allocations.GarbageCollect", &req, &resp)

		require.NotNil(err)
		require.EqualError(err, nstructs.ErrPermissionDenied.Error())
	}

	// Try request with a valid token
	{
		token := mock.CreatePolicyAndToken(t, server.State(), 1005, "test-valid",
			mock.NamespacePolicy(nstructs.DefaultNamespace, "", []string{acl.NamespaceCapabilitySubmitJob}))
		req := &nstructs.AllocSpecificRequest{}
		req.AllocID = alloc.ID
		req.AuthToken = token.SecretID
		req.Namespace = nstructs.DefaultNamespace

		var resp nstructs.GenericResponse
		err := client.ClientRPC("Allocations.GarbageCollect", &req, &resp)
		require.Error(err, noSuchAllocErr)
	}

	// Try request with a management token
	{
		req := &nstructs.AllocSpecificRequest{}
		// Target the same allocation as every other case so the request is
		// evaluated against a real allocation rather than an empty ID.
		req.AllocID = alloc.ID
		req.AuthToken = root.SecretID

		var resp nstructs.GenericResponse
		err := client.ClientRPC("Allocations.GarbageCollect", &req, &resp)
		require.Error(err, noSuchAllocErr)
	}
}

// TestAllocations_Signal checks the Allocations.Signal client RPC: a request
// that names no allocation must yield an unknown-allocation error, and a
// request for the locally added mock allocation must fail because its "web"
// task is not running.
func TestAllocations_Signal(t *testing.T) {
	ci.Parallel(t)
	assert := require.New(t)

	c, shutdown := TestClient(t, nil)
	defer shutdown()

	alloc := mock.Alloc()
	assert.Nil(c.addAlloc(alloc, ""))

	// An empty request does not identify an allocation.
	args := &nstructs.AllocSignalRequest{}
	var reply nstructs.GenericResponse
	rpcErr := c.ClientRPC("Allocations.Signal", &args, &reply)
	assert.NotNil(rpcErr)
	assert.True(nstructs.IsErrUnknownAllocation(rpcErr))

	// Re-use the request, now pointed at the allocation added above.
	args.AllocID = alloc.ID

	var second nstructs.GenericResponse
	rpcErr = c.ClientRPC("Allocations.Signal", &args, &second)

	assert.Error(rpcErr, "Expected error, got: %s, resp: %#+v", rpcErr, second)
	assert.Contains(rpcErr.Error(), "Failed to signal task: web, err: Task not running")
}

// TestAllocations_Signal_ACL runs Allocations.Signal against an ACL-enabled
// server and client for a running batch allocation. Requests with no token or
// with a node-deny token must be rejected with permission denied; requests
// with an alloc-lifecycle token or the management token must succeed.
func TestAllocations_Signal_ACL(t *testing.T) {
	ci.Parallel(t)

	srv, srvAddr, mgmt, stopServer := testACLServer(t, nil)
	defer stopServer()

	c, stopClient := TestClient(t, func(cfg *config.Config) {
		cfg.Servers = []string{srvAddr}
		cfg.ACLEnabled = true
	})
	defer stopClient()

	batch := mock.BatchJob()
	batch.TaskGroups[0].Count = 1
	batch.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
	}

	// Block until the job is running and keep its first allocation.
	running := testutil.WaitForRunningWithToken(t, srv.RPC, batch, mgmt.SecretID)[0]

	// No token: permission denied.
	{
		args := &nstructs.AllocSignalRequest{}
		args.AllocID = running.ID

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.Signal", &args, &reply)
		require.NotNil(t, rpcErr)
		require.ErrorContains(t, rpcErr, nstructs.ErrPermissionDenied.Error())
	}

	// Node policy set to deny: permission denied.
	{
		denied := mock.CreatePolicyAndToken(t, srv.State(), 1005, "invalid", mock.NodePolicy(acl.PolicyDeny))
		args := &nstructs.AllocSignalRequest{}
		args.AllocID = running.ID
		args.AuthToken = denied.SecretID

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.Signal", &args, &reply)

		require.NotNil(t, rpcErr)
		require.EqualError(t, rpcErr, nstructs.ErrPermissionDenied.Error())
	}

	// Namespace policy with the alloc-lifecycle capability: allowed.
	{
		lifecyclePolicy := mock.NamespacePolicy(nstructs.DefaultNamespace, "", []string{acl.NamespaceCapabilityAllocLifecycle})
		allowed := mock.CreatePolicyAndToken(t, srv.State(), 1005, "test-valid", lifecyclePolicy)

		args := &nstructs.AllocSignalRequest{}
		args.AllocID = running.ID
		args.AuthToken = allowed.SecretID
		args.Namespace = nstructs.DefaultNamespace

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.Signal", &args, &reply)
		require.NoError(t, rpcErr)
	}

	// Management token: allowed.
	{
		args := &nstructs.AllocSignalRequest{}
		args.AllocID = running.ID
		args.AuthToken = mgmt.SecretID

		var reply nstructs.GenericResponse
		rpcErr := c.ClientRPC("Allocations.Signal", &args, &reply)
		require.NoError(t, rpcErr)
	}
}

// TestAllocations_Stats checks the Allocations.Stats client RPC: a request that
// names no allocation must fail, and a request for the locally added
// allocation is retried until it succeeds with a non-nil Stats value.
func TestAllocations_Stats(t *testing.T) {
	ci.Parallel(t)

	c, shutdown := TestClient(t, nil)
	defer shutdown()

	alloc := mock.Alloc()
	require.Nil(t, c.addAlloc(alloc, ""))

	// An empty request does not identify an allocation and must fail.
	args := &cstructs.AllocStatsRequest{}
	var reply cstructs.AllocStatsResponse
	rpcErr := c.ClientRPC("Allocations.Stats", &args, &reply)
	require.NotNil(t, rpcErr)

	// Re-use the request, now pointed at the allocation added above.
	args.AllocID = alloc.ID

	attempt := func() (bool, error) {
		var out cstructs.AllocStatsResponse
		if callErr := c.ClientRPC("Allocations.Stats", &args, &out); callErr != nil {
			return false, callErr
		}
		if out.Stats == nil {
			return false, fmt.Errorf("invalid stats object")
		}
		return true, nil
	}
	onTimeout := func(err error) {
		t.Fatalf("err: %v", err)
	}
	testutil.WaitForResult(attempt, onTimeout)
}

// TestAllocations_Stats_ACL runs Allocations.Stats against an ACL-enabled
// server and client for a running batch allocation. Requests with no token or
// with a node-deny token must be rejected with permission denied; requests
// with a read-job token or the management token must succeed.
func TestAllocations_Stats_ACL(t *testing.T) {
	ci.Parallel(t)

	srv, srvAddr, mgmt, stopServer := testACLServer(t, nil)
	defer stopServer()

	c, stopClient := TestClient(t, func(cfg *config.Config) {
		cfg.Servers = []string{srvAddr}
		cfg.ACLEnabled = true
	})
	defer stopClient()

	batch := mock.BatchJob()
	batch.TaskGroups[0].Count = 1
	batch.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
	}

	// Block until the job is running and keep its first allocation.
	running := testutil.WaitForRunningWithToken(t, srv.RPC, batch, mgmt.SecretID)[0]

	// No token: permission denied.
	{
		args := &cstructs.AllocStatsRequest{}
		args.AllocID = running.ID

		var reply cstructs.AllocStatsResponse
		rpcErr := c.ClientRPC("Allocations.Stats", &args, &reply)
		require.NotNil(t, rpcErr)
		require.ErrorContains(t, rpcErr, nstructs.ErrPermissionDenied.Error())
	}

	// Node policy set to deny: permission denied.
	{
		denied := mock.CreatePolicyAndToken(t, srv.State(), 1005, "invalid", mock.NodePolicy(acl.PolicyDeny))
		args := &cstructs.AllocStatsRequest{}
		args.AllocID = running.ID
		args.AuthToken = denied.SecretID

		var reply cstructs.AllocStatsResponse
		rpcErr := c.ClientRPC("Allocations.Stats", &args, &reply)

		require.NotNil(t, rpcErr)
		require.EqualError(t, rpcErr, nstructs.ErrPermissionDenied.Error())
	}

	// Namespace policy with the read-job capability: allowed.
	{
		readPolicy := mock.NamespacePolicy(nstructs.DefaultNamespace, "", []string{acl.NamespaceCapabilityReadJob})
		allowed := mock.CreatePolicyAndToken(t, srv.State(), 1005, "test-valid", readPolicy)

		args := &cstructs.AllocStatsRequest{}
		args.AllocID = running.ID
		args.AuthToken = allowed.SecretID
		args.Namespace = nstructs.DefaultNamespace

		var reply cstructs.AllocStatsResponse
		rpcErr := c.ClientRPC("Allocations.Stats", &args, &reply)
		require.NoError(t, rpcErr)
	}

	// Management token: allowed.
	{
		args := &cstructs.AllocStatsRequest{}
		args.AllocID = running.ID
		args.AuthToken = mgmt.SecretID

		var reply cstructs.AllocStatsResponse
		rpcErr := c.ClientRPC("Allocations.Stats", &args, &reply)
		require.NoError(t, rpcErr)
	}
}

// TestAlloc_Checks covers the Allocations.Checks client RPC using one shared
// test client: an unknown allocation ID, an allocation with no stored check
// results, an allocation with two stored results, and two allocations whose
// results must not leak into each other's responses.
func TestAlloc_Checks(t *testing.T) {
	ci.Parallel(t)

	c, shutdown := TestClient(t, nil)
	t.Cleanup(func() {
		must.NoError(t, shutdown())
	})

	stamp := time.Date(2022, 3, 4, 5, 6, 7, 8, time.UTC).Unix()

	// Task-level healthiness result.
	healthy := &nstructs.CheckQueryResult{
		ID:        "abc123",
		Mode:      "healthiness",
		Status:    "passing",
		Output:    "nomad: http ok",
		Timestamp: stamp,
		Group:     "group",
		Task:      "task",
		Service:   "service",
		Check:     "check",
	}

	// Readiness result with no Task set.
	ready := &nstructs.CheckQueryResult{
		ID:        "def456",
		Mode:      "readiness",
		Status:    "passing",
		Output:    "nomad: http ok",
		Timestamp: stamp,
		Group:     "group",
		Service:   "service2",
		Check:     "check",
	}

	// fetch issues the Allocations.Checks RPC for the given allocation ID.
	fetch := func(allocID string) (cstructs.AllocChecksResponse, error) {
		request := cstructs.AllocChecksRequest{AllocID: allocID}
		var response cstructs.AllocChecksResponse
		err := c.ClientRPC("Allocations.Checks", &request, &response)
		return response, err
	}

	t.Run("alloc does not exist", func(t *testing.T) {
		_, err := fetch("d3e34248-4843-be75-d4fd-4899975cfb38")
		must.EqError(t, err, `Unknown allocation "d3e34248-4843-be75-d4fd-4899975cfb38"`)
	})

	t.Run("no checks for alloc", func(t *testing.T) {
		alloc := mock.Alloc()
		must.NoError(t, c.addAlloc(alloc, ""))

		response, err := fetch(alloc.ID)
		must.NoError(t, err)
		must.MapEmpty(t, response.Results)
	})

	t.Run("two in one alloc", func(t *testing.T) {
		alloc := mock.Alloc()
		must.NoError(t, c.addAlloc(alloc, ""))
		must.NoError(t, c.checkStore.Set(alloc.ID, healthy))
		must.NoError(t, c.checkStore.Set(alloc.ID, ready))

		response, err := fetch(alloc.ID)
		must.NoError(t, err)

		want := map[nstructs.CheckID]*nstructs.CheckQueryResult{
			"abc123": healthy,
			"def456": ready,
		}
		must.MapEq(t, want, response.Results)
	})

	t.Run("ignore unrelated alloc", func(t *testing.T) {
		first := mock.Alloc()
		must.NoError(t, c.addAlloc(first, ""))

		second := mock.Alloc()
		must.NoError(t, c.addAlloc(second, ""))
		must.NoError(t, c.checkStore.Set(first.ID, healthy))
		must.NoError(t, c.checkStore.Set(second.ID, ready))

		// Only the result stored under the first allocation may come back.
		response, err := fetch(first.ID)
		must.NoError(t, err)

		want := map[nstructs.CheckID]*nstructs.CheckQueryResult{
			"abc123": healthy,
		}
		must.MapEq(t, want, response.Results)
	})
}

// TestAlloc_ExecStreaming runs the Allocations.Exec streaming RPC against a
// running mock-driver allocation and verifies that the configured stdout,
// stderr and exit code are all received within three seconds.
func TestAlloc_ExecStreaming(t *testing.T) {
	ci.Parallel(t)
	require := require.New(t)

	// Start a server and client
	s, cleanupS := nomad.TestServer(t, nil)
	defer cleanupS()
	testutil.WaitForLeader(t, s.RPC)

	c, cleanupC := TestClient(t, func(c *config.Config) {
		c.Servers = []string{s.GetConfig().RPCAddr.String()}
	})
	defer cleanupC()

	expectedStdout := "Hello from the other side\n"
	expectedStderr := "Hello from the other side\n"
	job := mock.BatchJob()
	job.TaskGroups[0].Count = 1
	job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
		"exec_command": map[string]interface{}{
			"run_for":       "1ms",
			"stdout_string": expectedStdout,
			"stderr_string": expectedStderr,
			"exit_code":     3,
		},
	}

	// Wait for client to be running job
	testutil.WaitForRunning(t, s.RPC, job)

	// Get the allocation ID
	args := nstructs.AllocListRequest{}
	args.Region = "global"
	resp := nstructs.AllocListResponse{}
	require.NoError(s.RPC("Alloc.List", &args, &resp))
	require.Len(resp.Allocations, 1)
	allocID := resp.Allocations[0].ID

	// Make the request
	req := &cstructs.AllocExecRequest{
		AllocID:      allocID,
		Task:         job.TaskGroups[0].Tasks[0].Name,
		Tty:          true,
		Cmd:          []string{"placeholder command"},
		QueryOptions: nstructs.QueryOptions{Region: "global"},
	}

	// Get the handler
	handler, err := c.StreamingRpcHandler("Allocations.Exec")
	require.Nil(err)

	// Create a pipe: the handler serves p2 while the test writes the request
	// to p1 and decodes response frames from it.
	p1, p2 := net.Pipe()
	defer p1.Close()
	defer p2.Close()

	errCh := make(chan error)
	frames := make(chan *drivers.ExecTaskStreamingResponseMsg)

	// Start the handler
	go handler(p2)
	go decodeFrames(t, p1, frames, errCh)

	// Send the request
	encoder := codec.NewEncoder(p1, nstructs.MsgpackHandle)
	require.Nil(encoder.Encode(req))

	timeout := time.After(3 * time.Second)

	exitCode := -1
	receivedStdout := ""
	receivedStderr := ""

OUTER:
	for {
		select {
		case <-timeout:
			// Timeout report: name whichever piece of output is missing
			// before failing. Each stream is compared with its own buffer.
			require.Equal(expectedStdout, receivedStdout, "didn't receive expected stdout")
			require.Equal(expectedStderr, receivedStderr, "didn't receive expected stderr")
			require.Equal(3, exitCode, "failed to get exit code")
			require.FailNow("timed out")
		case err := <-errCh:
			require.NoError(err)
		case f := <-frames:
			// Accumulate output across frames; a frame is classified by the
			// first populated part.
			switch {
			case f.Stdout != nil && len(f.Stdout.Data) != 0:
				receivedStdout += string(f.Stdout.Data)
			case f.Stderr != nil && len(f.Stderr.Data) != 0:
				receivedStderr += string(f.Stderr.Data)
			case f.Exited && f.Result != nil:
				exitCode = int(f.Result.ExitCode)
			default:
				t.Logf("received unrelevant frame: %v", f)
			}

			// Stop as soon as everything expected has arrived.
			if expectedStdout == receivedStdout && expectedStderr == receivedStderr && exitCode == 3 {
				break OUTER
			}
		}
	}
}

// TestAlloc_ExecStreaming_NoAllocation sends an Allocations.Exec streaming
// request for a freshly generated allocation ID and expects an
// unknown-allocation error, with no response frame, within three seconds.
func TestAlloc_ExecStreaming_NoAllocation(t *testing.T) {
	ci.Parallel(t)

	// Bring up a server and a client attached to it.
	srv, stopServer := nomad.TestServer(t, nil)
	defer stopServer()
	testutil.WaitForLeader(t, srv.RPC)

	cl, stopClient := TestClient(t, func(cfg *config.Config) {
		cfg.Servers = []string{srv.GetConfig().RPCAddr.String()}
	})
	defer stopClient()

	// Exec request for an allocation ID generated just for this test.
	execReq := &cstructs.AllocExecRequest{
		AllocID:      uuid.Generate(),
		Task:         "testtask",
		Tty:          true,
		Cmd:          []string{"placeholder command"},
		QueryOptions: nstructs.QueryOptions{Region: "global"},
	}

	handler, err := cl.StreamingRpcHandler("Allocations.Exec")
	require.Nil(t, err)

	// The handler serves one end of the pipe; the test writes the request to
	// the other end and decodes frames from it.
	testEnd, handlerEnd := net.Pipe()
	defer testEnd.Close()
	defer handlerEnd.Close()

	decodeErrs := make(chan error)
	frameCh := make(chan *drivers.ExecTaskStreamingResponseMsg)

	go handler(handlerEnd)
	go decodeFrames(t, testEnd, frameCh, decodeErrs)

	enc := codec.NewEncoder(testEnd, nstructs.MsgpackHandle)
	require.Nil(t, enc.Encode(execReq))

	deadline := time.After(3 * time.Second)

	select {
	case <-deadline:
		require.FailNow(t, "timed out")
	case streamErr := <-decodeErrs:
		require.True(t, nstructs.IsErrUnknownAllocation(streamErr), "expected no allocation error but found: %v", streamErr)
	case frame := <-frameCh:
		require.Fail(t, "received unexpected frame", "frame: %#v", frame)
	}
}

// TestAlloc_ExecStreaming_DisableRemoteExec sends an Allocations.Exec streaming
// request to a client configured with DisableRemoteExec set and expects a
// permission-denied error, with no response frame, within three seconds.
func TestAlloc_ExecStreaming_DisableRemoteExec(t *testing.T) {
	ci.Parallel(t)

	// Bring up a server and a client attached to it.
	srv, stopServer := nomad.TestServer(t, nil)
	defer stopServer()
	testutil.WaitForLeader(t, srv.RPC)

	cl, stopClient := TestClient(t, func(cfg *config.Config) {
		cfg.Servers = []string{srv.GetConfig().RPCAddr.String()}
		cfg.DisableRemoteExec = true
	})
	defer stopClient()

	// Exec request for an allocation ID generated just for this test.
	execReq := &cstructs.AllocExecRequest{
		AllocID:      uuid.Generate(),
		Task:         "testtask",
		Tty:          true,
		Cmd:          []string{"placeholder command"},
		QueryOptions: nstructs.QueryOptions{Region: "global"},
	}

	handler, err := cl.StreamingRpcHandler("Allocations.Exec")
	require.Nil(t, err)

	// The handler serves one end of the pipe; the test writes the request to
	// the other end and decodes frames from it.
	testEnd, handlerEnd := net.Pipe()
	defer testEnd.Close()
	defer handlerEnd.Close()

	decodeErrs := make(chan error)
	frameCh := make(chan *drivers.ExecTaskStreamingResponseMsg)

	go handler(handlerEnd)
	go decodeFrames(t, testEnd, frameCh, decodeErrs)

	enc := codec.NewEncoder(testEnd, nstructs.MsgpackHandle)
	require.Nil(t, enc.Encode(execReq))

	deadline := time.After(3 * time.Second)

	select {
	case <-deadline:
		require.FailNow(t, "timed out")
	case streamErr := <-decodeErrs:
		require.True(t, nstructs.IsErrPermissionDenied(streamErr), "expected permission denied error but found: %v", streamErr)
	case frame := <-frameCh:
		require.Fail(t, "received unexpected frame", "frame: %#v", frame)
	}
}

// TestAlloc_ExecStreaming_ACL_Basic issues Allocations.Exec streaming requests
// for a task name of "testtask" on a running allocation, once per token. A
// token scoped to another namespace must get permission denied; a token with
// alloc-exec and read-fs, and the management token, must get past the ACL
// check and fail with "task not found" instead.
func TestAlloc_ExecStreaming_ACL_Basic(t *testing.T) {
	ci.Parallel(t)

	// Bring up an ACL-enabled server and a client attached to it.
	srv, mgmt, stopServer := nomad.TestACLServer(t, nil)
	defer stopServer()
	testutil.WaitForLeader(t, srv.RPC)

	cl, stopClient := TestClient(t, func(cfg *config.Config) {
		cfg.ACLEnabled = true
		cfg.Servers = []string{srv.GetConfig().RPCAddr.String()}
	})
	defer stopClient()

	// Token whose only policy is a deny on the "other" namespace.
	denyPolicy := mock.NamespacePolicy("other", "", []string{acl.NamespaceCapabilityDeny})
	deniedToken := mock.CreatePolicyAndToken(t, srv.State(), 1005, "invalid", denyPolicy)

	// Token with alloc-exec and read-fs on the default namespace.
	execPolicy := mock.NamespacePolicy(nstructs.DefaultNamespace, "",
		[]string{acl.NamespaceCapabilityAllocExec, acl.NamespaceCapabilityReadFS})
	execToken := mock.CreatePolicyAndToken(t, srv.State(), 1009, "valid2", execPolicy)

	batch := mock.BatchJob()
	batch.TaskGroups[0].Count = 1
	batch.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
	}

	// Block until the job is running and keep its first allocation.
	running := testutil.WaitForRunningWithToken(t, srv.RPC, batch, mgmt.SecretID)[0]

	scenarios := []struct {
		Name          string
		Token         string
		ExpectedError string
	}{
		{
			Name:          "bad token",
			Token:         deniedToken.SecretID,
			ExpectedError: nstructs.ErrPermissionDenied.Error(),
		},
		{
			Name:          "good token",
			Token:         execToken.SecretID,
			ExpectedError: "task not found",
		},
		{
			Name:          "root token",
			Token:         mgmt.SecretID,
			ExpectedError: "task not found",
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.Name, func(t *testing.T) {
			assert := require.New(t)

			execReq := &cstructs.AllocExecRequest{
				AllocID: running.ID,
				Task:    "testtask",
				Tty:     true,
				Cmd:     []string{"placeholder command"},
				QueryOptions: nstructs.QueryOptions{
					Region:    "global",
					AuthToken: sc.Token,
					Namespace: nstructs.DefaultNamespace,
				},
			}

			handler, err := cl.StreamingRpcHandler("Allocations.Exec")
			assert.Nil(err)

			// The handler serves one end of the pipe; the test writes the
			// request to the other end and decodes frames from it.
			testEnd, handlerEnd := net.Pipe()
			defer testEnd.Close()
			defer handlerEnd.Close()

			decodeErrs := make(chan error)
			frameCh := make(chan *drivers.ExecTaskStreamingResponseMsg)

			go handler(handlerEnd)
			go decodeFrames(t, testEnd, frameCh, decodeErrs)

			enc := codec.NewEncoder(testEnd, nstructs.MsgpackHandle)
			assert.Nil(enc.Encode(execReq))

			// Every scenario expects an error, so a frame or a timeout is a
			// failure.
			deadline := time.After(3 * time.Second)
			select {
			case <-deadline:
				assert.FailNow("timed out")
			case streamErr := <-decodeErrs:
				assert.Contains(streamErr.Error(), sc.ExpectedError)
			case frame := <-frameCh:
				assert.Fail("received unexpected frame", "frame: %#v", frame)
			}
		})
	}
}

// TestAlloc_ExecStreaming_ACL_WithIsolation_Image asserts that token only needs
// alloc-exec acl policy when image isolation is used.
//
// The mock driver is configured with fs_isolation set to
// drivers.FSIsolationImage. One Allocations.Exec streaming request is issued
// per token: a token scoped to another namespace must get permission denied,
// while the alloc-exec token, the alloc-exec + alloc-node-exec token and the
// management token must not produce an error.
func TestAlloc_ExecStreaming_ACL_WithIsolation_Image(t *testing.T) {
	ci.Parallel(t)
	isolation := drivers.FSIsolationImage

	// Start a server and client
	s, root, cleanupS := nomad.TestACLServer(t, nil)
	defer cleanupS()
	testutil.WaitForLeader(t, s.RPC)

	client, cleanupC := TestClient(t, func(c *config.Config) {
		c.ACLEnabled = true
		c.Servers = []string{s.GetConfig().RPCAddr.String()}

		pluginConfig := []*nconfig.PluginConfig{
			{
				Name: "mock_driver",
				Config: map[string]interface{}{
					"fs_isolation": string(isolation),
				},
			},
		}

		c.PluginLoader = catalog.TestPluginLoaderWithOptions(t, "", map[string]string{}, pluginConfig)
	})
	defer cleanupC()

	// Create a bad token
	policyBad := mock.NamespacePolicy("other", "", []string{acl.NamespaceCapabilityDeny})
	tokenBad := mock.CreatePolicyAndToken(t, s.State(), 1005, "invalid", policyBad)

	// Each token gets its own policy name, mirroring the chroot variant of
	// this test, so the two policies are registered separately.
	policyAllocExec := mock.NamespacePolicy(nstructs.DefaultNamespace, "",
		[]string{acl.NamespaceCapabilityAllocExec})
	tokenAllocExec := mock.CreatePolicyAndToken(t, s.State(), 1009, "alloc-exec", policyAllocExec)

	policyAllocNodeExec := mock.NamespacePolicy(nstructs.DefaultNamespace, "",
		[]string{acl.NamespaceCapabilityAllocExec, acl.NamespaceCapabilityAllocNodeExec})
	tokenAllocNodeExec := mock.CreatePolicyAndToken(t, s.State(), 1009, "alloc-node-exec", policyAllocNodeExec)

	job := mock.BatchJob()
	job.TaskGroups[0].Count = 1
	job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
		"exec_command": map[string]interface{}{
			"run_for":       "1ms",
			"stdout_string": "some output",
		},
	}

	// Wait for client to be running job
	testutil.WaitForRunningWithToken(t, s.RPC, job, root.SecretID)

	// Get the allocation ID
	args := nstructs.AllocListRequest{}
	args.Region = "global"
	args.AuthToken = root.SecretID
	args.Namespace = nstructs.DefaultNamespace
	resp := nstructs.AllocListResponse{}
	require.NoError(t, s.RPC("Alloc.List", &args, &resp))
	require.Len(t, resp.Allocations, 1)
	allocID := resp.Allocations[0].ID

	cases := []struct {
		Name          string
		Token         string
		ExpectedError string
	}{
		{
			Name:          "bad token",
			Token:         tokenBad.SecretID,
			ExpectedError: nstructs.ErrPermissionDenied.Error(),
		},
		{
			Name:          "alloc-exec token",
			Token:         tokenAllocExec.SecretID,
			ExpectedError: "",
		},
		{
			Name:          "alloc-node-exec token",
			Token:         tokenAllocNodeExec.SecretID,
			ExpectedError: "",
		},
		{
			Name:          "root token",
			Token:         root.SecretID,
			ExpectedError: "",
		},
	}

	for _, c := range cases {
		t.Run(c.Name, func(t *testing.T) {

			// Make the request
			req := &cstructs.AllocExecRequest{
				AllocID: allocID,
				Task:    job.TaskGroups[0].Tasks[0].Name,
				Tty:     true,
				Cmd:     []string{"placeholder command"},
				QueryOptions: nstructs.QueryOptions{
					Region:    "global",
					AuthToken: c.Token,
					Namespace: nstructs.DefaultNamespace,
				},
			}

			// Get the handler
			handler, err := client.StreamingRpcHandler("Allocations.Exec")
			require.Nil(t, err)

			// Create a pipe: the handler serves p2 while the test writes the
			// request to p1 and decodes response frames from it.
			p1, p2 := net.Pipe()
			defer p1.Close()
			defer p2.Close()

			errCh := make(chan error)
			frames := make(chan *drivers.ExecTaskStreamingResponseMsg)

			// Start the handler
			go handler(p2)
			go decodeFrames(t, p1, frames, errCh)

			// Send the request
			encoder := codec.NewEncoder(p1, nstructs.MsgpackHandle)
			require.Nil(t, encoder.Encode(req))

			select {
			case <-time.After(3 * time.Second):
				// NOTE(review): a timeout ends the case without a failure,
				// even when an error was expected; confirm this is intended.
			case err := <-errCh:
				if c.ExpectedError == "" {
					require.NoError(t, err)
				} else {
					require.Contains(t, err.Error(), c.ExpectedError)
				}
			case f := <-frames:
				// we are good if we don't expect an error
				if c.ExpectedError != "" {
					require.Fail(t, "unexpected frame", "frame: %#v", f)
				}
			}
		})
	}
}

// TestAlloc_ExecStreaming_ACL_WithIsolation_Chroot asserts that a token only
// needs the alloc-exec ACL capability when the mock driver is configured with
// chroot filesystem isolation. A token without that capability must be
// rejected with a permission denied error.
func TestAlloc_ExecStreaming_ACL_WithIsolation_Chroot(t *testing.T) {
	ci.SkipSlow(t, "flaky on GHA; too much disk IO")
	ci.Parallel(t)

	if runtime.GOOS != "linux" || unix.Geteuid() != 0 {
		t.Skip("chroot isolation requires linux root")
	}

	isolation := drivers.FSIsolationChroot

	// Start a server and client
	s, root, cleanupS := nomad.TestACLServer(t, nil)
	defer cleanupS()
	testutil.WaitForLeader(t, s.RPC)

	client, cleanup := TestClient(t, func(c *config.Config) {
		c.ACLEnabled = true
		c.Servers = []string{s.GetConfig().RPCAddr.String()}

		// Configure the mock driver with the isolation mode under test
		pluginConfig := []*nconfig.PluginConfig{
			{
				Name: "mock_driver",
				Config: map[string]interface{}{
					"fs_isolation": string(isolation),
				},
			},
		}

		c.PluginLoader = catalog.TestPluginLoaderWithOptions(t, "", map[string]string{}, pluginConfig)
	})
	defer cleanup()

	// Create a bad token
	policyBad := mock.NamespacePolicy("other", "", []string{acl.NamespaceCapabilityDeny})
	tokenBad := mock.CreatePolicyAndToken(t, s.State(), 1005, "invalid", policyBad)

	// Create a token that only carries alloc-exec
	policyAllocExec := mock.NamespacePolicy(nstructs.DefaultNamespace, "",
		[]string{acl.NamespaceCapabilityAllocExec})
	tokenAllocExec := mock.CreatePolicyAndToken(t, s.State(), 1009, "alloc-exec", policyAllocExec)

	// Create a token that carries both alloc-exec and alloc-node-exec
	// NOTE(review): index 1009 is also used for the alloc-exec token above;
	// confirm the reuse is intentional.
	policyAllocNodeExec := mock.NamespacePolicy(nstructs.DefaultNamespace, "",
		[]string{acl.NamespaceCapabilityAllocExec, acl.NamespaceCapabilityAllocNodeExec})
	tokenAllocNodeExec := mock.CreatePolicyAndToken(t, s.State(), 1009, "alloc-node-exec", policyAllocNodeExec)

	job := mock.BatchJob()
	job.TaskGroups[0].Count = 1
	job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
		"exec_command": map[string]interface{}{
			"run_for":       "1ms",
			"stdout_string": "some output",
		},
	}

	// Wait for client to be running job
	testutil.WaitForRunningWithToken(t, s.RPC, job, root.SecretID)

	// Get the allocation ID
	args := nstructs.AllocListRequest{}
	args.Region = "global"
	args.AuthToken = root.SecretID
	args.Namespace = nstructs.DefaultNamespace
	resp := nstructs.AllocListResponse{}
	require.NoError(t, s.RPC("Alloc.List", &args, &resp))
	require.Len(t, resp.Allocations, 1)
	allocID := resp.Allocations[0].ID

	// An empty ExpectedError means the exec request must be accepted.
	cases := []struct {
		Name          string
		Token         string
		ExpectedError string
	}{
		{
			Name:          "bad token",
			Token:         tokenBad.SecretID,
			ExpectedError: nstructs.ErrPermissionDenied.Error(),
		},
		{
			Name:          "alloc-exec token",
			Token:         tokenAllocExec.SecretID,
			ExpectedError: "",
		},
		{
			Name:          "alloc-node-exec token",
			Token:         tokenAllocNodeExec.SecretID,
			ExpectedError: "",
		},
		{
			Name:          "root token",
			Token:         root.SecretID,
			ExpectedError: "",
		},
	}

	for _, c := range cases {
		t.Run(c.Name, func(t *testing.T) {

			// Make the request
			req := &cstructs.AllocExecRequest{
				AllocID: allocID,
				Task:    job.TaskGroups[0].Tasks[0].Name,
				Tty:     true,
				Cmd:     []string{"placeholder command"},
				QueryOptions: nstructs.QueryOptions{
					Region:    "global",
					AuthToken: c.Token,
					Namespace: nstructs.DefaultNamespace,
				},
			}

			// Get the handler
			handler, err := client.StreamingRpcHandler("Allocations.Exec")
			require.NoError(t, err)

			// Create a pipe
			p1, p2 := net.Pipe()

			errCh := make(chan error)
			frames := make(chan *drivers.ExecTaskStreamingResponseMsg)
			decoderDone := make(chan struct{})

			// Start the handler and the frame decoder
			go handler(p2)
			go func() {
				defer close(decoderDone)
				decodeFrames(t, p1, frames, errCh)
			}()

			// Closing the pipe ends the stream; keep receiving until the
			// decoder goroutine has exited so that it is never left blocked
			// on a send and never logs through t after the subtest finished.
			defer func() {
				p1.Close()
				p2.Close()
				for {
					select {
					case <-frames:
					case <-errCh:
					case <-decoderDone:
						return
					}
				}
			}()

			// Send the request
			encoder := codec.NewEncoder(p1, nstructs.MsgpackHandle)
			require.NoError(t, encoder.Encode(req))

			select {
			case <-time.After(3 * time.Second):
				// Silence is tolerated only when access should be granted;
				// a denied request must produce an error.
				if c.ExpectedError != "" {
					require.Fail(t, "timed out waiting for error", "expected error containing %q", c.ExpectedError)
				}
			case err := <-errCh:
				if c.ExpectedError == "" {
					require.NoError(t, err)
				} else {
					require.Contains(t, err.Error(), c.ExpectedError)
				}
			case f := <-frames:
				// we are good if we don't expect an error
				if c.ExpectedError != "" {
					require.Fail(t, "unexpected frame", "frame: %#v", f)
				}
			}
		})
	}
}

// TestAlloc_ExecStreaming_ACL_WithIsolation_None asserts that a token needs
// the alloc-node-exec ACL capability in addition to alloc-exec when the mock
// driver is configured with no filesystem isolation. Tokens lacking either
// capability must be rejected with a permission denied error.
func TestAlloc_ExecStreaming_ACL_WithIsolation_None(t *testing.T) {
	ci.Parallel(t)
	isolation := drivers.FSIsolationNone

	// Start a server and client
	s, root, cleanupS := nomad.TestACLServer(t, nil)
	defer cleanupS()
	testutil.WaitForLeader(t, s.RPC)

	client, cleanup := TestClient(t, func(c *config.Config) {
		c.ACLEnabled = true
		c.Servers = []string{s.GetConfig().RPCAddr.String()}

		// Configure the mock driver with the isolation mode under test
		pluginConfig := []*nconfig.PluginConfig{
			{
				Name: "mock_driver",
				Config: map[string]interface{}{
					"fs_isolation": string(isolation),
				},
			},
		}

		c.PluginLoader = catalog.TestPluginLoaderWithOptions(t, "", map[string]string{}, pluginConfig)
	})
	defer cleanup()

	// Create a bad token
	policyBad := mock.NamespacePolicy("other", "", []string{acl.NamespaceCapabilityDeny})
	tokenBad := mock.CreatePolicyAndToken(t, s.State(), 1005, "invalid", policyBad)

	// Create a token that only carries alloc-exec
	policyAllocExec := mock.NamespacePolicy(nstructs.DefaultNamespace, "",
		[]string{acl.NamespaceCapabilityAllocExec})
	tokenAllocExec := mock.CreatePolicyAndToken(t, s.State(), 1009, "alloc-exec", policyAllocExec)

	// Create a token that carries both alloc-exec and alloc-node-exec
	// NOTE(review): index 1009 is also used for the alloc-exec token above;
	// confirm the reuse is intentional.
	policyAllocNodeExec := mock.NamespacePolicy(nstructs.DefaultNamespace, "",
		[]string{acl.NamespaceCapabilityAllocExec, acl.NamespaceCapabilityAllocNodeExec})
	tokenAllocNodeExec := mock.CreatePolicyAndToken(t, s.State(), 1009, "alloc-node-exec", policyAllocNodeExec)

	job := mock.BatchJob()
	job.TaskGroups[0].Count = 1
	job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
		"run_for": "20s",
		"exec_command": map[string]interface{}{
			"run_for":       "1ms",
			"stdout_string": "some output",
		},
	}

	// Wait for client to be running job
	testutil.WaitForRunningWithToken(t, s.RPC, job, root.SecretID)

	// Get the allocation ID
	args := nstructs.AllocListRequest{}
	args.Region = "global"
	args.AuthToken = root.SecretID
	args.Namespace = nstructs.DefaultNamespace
	resp := nstructs.AllocListResponse{}
	require.NoError(t, s.RPC("Alloc.List", &args, &resp))
	require.Len(t, resp.Allocations, 1)
	allocID := resp.Allocations[0].ID

	// An empty ExpectedError means the exec request must be accepted.
	cases := []struct {
		Name          string
		Token         string
		ExpectedError string
	}{
		{
			Name:          "bad token",
			Token:         tokenBad.SecretID,
			ExpectedError: nstructs.ErrPermissionDenied.Error(),
		},
		{
			Name:          "alloc-exec token",
			Token:         tokenAllocExec.SecretID,
			ExpectedError: nstructs.ErrPermissionDenied.Error(),
		},
		{
			Name:          "alloc-node-exec token",
			Token:         tokenAllocNodeExec.SecretID,
			ExpectedError: "",
		},
		{
			Name:          "root token",
			Token:         root.SecretID,
			ExpectedError: "",
		},
	}

	for _, c := range cases {
		t.Run(c.Name, func(t *testing.T) {

			// Make the request
			req := &cstructs.AllocExecRequest{
				AllocID: allocID,
				Task:    job.TaskGroups[0].Tasks[0].Name,
				Tty:     true,
				Cmd:     []string{"placeholder command"},
				QueryOptions: nstructs.QueryOptions{
					Region:    "global",
					AuthToken: c.Token,
					Namespace: nstructs.DefaultNamespace,
				},
			}

			// Get the handler
			handler, err := client.StreamingRpcHandler("Allocations.Exec")
			require.NoError(t, err)

			// Create a pipe
			p1, p2 := net.Pipe()

			errCh := make(chan error)
			frames := make(chan *drivers.ExecTaskStreamingResponseMsg)
			decoderDone := make(chan struct{})

			// Start the handler and the frame decoder
			go handler(p2)
			go func() {
				defer close(decoderDone)
				decodeFrames(t, p1, frames, errCh)
			}()

			// Closing the pipe ends the stream; keep receiving until the
			// decoder goroutine has exited so that it is never left blocked
			// on a send and never logs through t after the subtest finished.
			defer func() {
				p1.Close()
				p2.Close()
				for {
					select {
					case <-frames:
					case <-errCh:
					case <-decoderDone:
						return
					}
				}
			}()

			// Send the request
			encoder := codec.NewEncoder(p1, nstructs.MsgpackHandle)
			require.NoError(t, encoder.Encode(req))

			select {
			case <-time.After(3 * time.Second):
				// Silence is tolerated only when access should be granted;
				// a denied request must produce an error.
				if c.ExpectedError != "" {
					require.Fail(t, "timed out waiting for error", "expected error containing %q", c.ExpectedError)
				}
			case err := <-errCh:
				if c.ExpectedError == "" {
					require.NoError(t, err)
				} else {
					require.Contains(t, err.Error(), c.ExpectedError)
				}
			case f := <-frames:
				// we are good if we don't expect an error
				if c.ExpectedError != "" {
					require.Fail(t, "unexpected frame", "frame: %#v", f)
				}
			}
		})
	}
}

// decodeFrames reads msgpack-encoded cstructs.StreamErrWrapper messages from
// p1 in a loop and forwards their contents to the caller:
//
//   - a message carrying an Error is sent on errCh and reading continues;
//   - otherwise the Payload is JSON-decoded into a
//     drivers.ExecTaskStreamingResponseMsg and sent on frames.
//
// The loop ends quietly when the decoder reports io.EOF or an error whose text
// contains "closed". Any other decode error, or a payload that fails to
// unmarshal, is reported on errCh before returning.
//
// Every send is a plain blocking channel send, so the function only makes
// progress while the caller keeps receiving from frames and errCh.
func decodeFrames(t *testing.T, p1 net.Conn, frames chan<- *drivers.ExecTaskStreamingResponseMsg, errCh chan<- error) {
	// Start the decoder
	decoder := codec.NewDecoder(p1, nstructs.MsgpackHandle)

	for {
		var msg cstructs.StreamErrWrapper
		if err := decoder.Decode(&msg); err != nil {
			// A closed connection is the expected way for the stream to end
			if err == io.EOF || strings.Contains(err.Error(), "closed") {
				return
			}
			t.Logf("received error decoding: %#v", err)

			// Wrap with %w so receivers can still inspect the underlying error
			errCh <- fmt.Errorf("error decoding: %w", err)
			return
		}

		if msg.Error != nil {
			errCh <- msg.Error
			continue
		}

		var frame drivers.ExecTaskStreamingResponseMsg
		if err := json.Unmarshal(msg.Payload, &frame); err != nil {
			errCh <- err
			return
		}
		t.Logf("received message: %#v", msg)
		frames <- &frame
	}
}