open-nomad/e2e/e2eutil/node.go

package e2eutil

import (
	"fmt"
	"os"
	"path/filepath"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/testutil"
)

// AgentDisconnect is a test helper that registers the raw_exec job
// defined in input/disconnect-node.nomad, which disconnects a client at
// the network level and reconnects it after the given period of time.
//
// The job is registered under the ID "disconnect-<nodeID>". The node ID
// is always passed to the job as the "nodeID" variable; when after is
// positive it is additionally passed, truncated to whole seconds, as the
// "time" variable.
//
// It returns as soon as the job has been registered (not once the
// duration has elapsed) so that callers can act while the client is
// disconnected. The returned values are the ID of the disconnect job and
// any registration error. If the working directory cannot be determined,
// an empty job ID is returned along with that error.
func AgentDisconnect(nodeID string, after time.Duration) (string, error) {
	jobID := "disconnect-" + nodeID

	args := []string{"-var", "nodeID=" + nodeID}
	if after > 0 {
		seconds := int(after.Seconds())
		args = append(args, "-var", fmt.Sprintf("time=%d", seconds))
	}

	cwd, err := os.Getwd()
	if err != nil {
		return "", err
	}

	// TODO: temporary hack around having older tests running on the
	// framework vs new tests not, as the framework has a different
	// working directory
	jobFilePath := "../e2eutil/input/disconnect-node.nomad"
	if filepath.Base(cwd) == "e2e" {
		jobFilePath = "e2eutil/input/disconnect-node.nomad"
	}

	return jobID, RegisterWithArgs(jobID, jobFilePath, args...)
}

// AgentRestartAfter is a test helper that registers the raw_exec job
// defined in input/restart-node.nomad, which stops a client and restarts
// it after the given period of time. The node must be running under
// systemd.
//
// The job is registered under the ID "restart-<nodeID>". The node ID is
// always passed to the job as the "nodeID" variable; when after is
// positive it is additionally passed, truncated to whole seconds, as the
// "time" variable.
//
// It returns as soon as the job has been registered (not once the
// duration has elapsed) so that callers can act while the client is
// down. The returned values are the ID of the restart job and any
// registration error. If the working directory cannot be determined, an
// empty job ID is returned along with that error.
func AgentRestartAfter(nodeID string, after time.Duration) (string, error) {
	jobID := "restart-" + nodeID

	jobArgs := []string{"-var", "nodeID=" + nodeID}
	if after > 0 {
		delay := fmt.Sprintf("time=%d", int(after.Seconds()))
		jobArgs = append(jobArgs, "-var", delay)
	}

	wd, err := os.Getwd()
	if err != nil {
		return "", err
	}

	// TODO: temporary hack around having older tests running on the
	// framework vs new tests not, as the framework has a different
	// working directory
	var jobFilePath string
	switch filepath.Base(wd) {
	case "e2e":
		jobFilePath = "e2eutil/input/restart-node.nomad"
	default:
		jobFilePath = "../e2eutil/input/restart-node.nomad"
	}

	regErr := RegisterWithArgs(jobID, jobFilePath, jobArgs...)
	return jobID, regErr
}

// AgentRestart is a test helper function that restarts a client node
// running under systemd using a raw_exec job, and then waits for the
// node to report a "ready" status again.
//
// The restart job is registered via AgentRestartAfter with no delay.
// The function then polls up to 30 times, sleeping one second before
// each attempt: once any task of the first allocation returned for the
// restart job is in the "dead" state, the node is looked up and, if its
// status is "ready", the function returns successfully.
//
// The job ID of the restart job is always returned (including on
// failure) so that callers can clean it up. If the node does not become
// ready within the retry budget, the returned error wraps the most
// recent failure reason so callers can inspect it with errors.Is or
// errors.As.
func AgentRestart(client *api.Client, nodeID string) (string, error) {
	jobID, err := AgentRestartAfter(nodeID, 0)
	if err != nil {
		return jobID, err
	}

	// reasonErr tracks the most recent explanation for why the node is
	// not considered ready yet; it is only reported if we run out of
	// retries.
	reasonErr := fmt.Errorf("timed out")
	for retries := 30; retries > 0; retries-- {
		time.Sleep(1 * time.Second)

		allocStubs, _, err := client.Jobs().Allocations(jobID, true, nil)
		if err != nil {
			reasonErr = err
			continue
		}
		if len(allocStubs) == 0 {
			// The restart job has not been placed yet.
			continue
		}

		// Only the first allocation returned for the job is inspected.
		for _, state := range allocStubs[0].TaskStates {
			if state.State != "dead" {
				continue
			}
			// A dead restart task means the restart has been carried
			// out, so it is now meaningful to check the node status.
			node, _, err := client.Nodes().Info(nodeID, nil)
			if err != nil {
				reasonErr = err
				break
			}
			if node != nil && node.Status == "ready" {
				return jobID, nil
			}
			reasonErr = fmt.Errorf("node status not ready")
		}
	}
	return jobID, fmt.Errorf("node did not become ready: %w", reasonErr)
}

// ListWindowsClientNodes reports the IDs of the client nodes whose
// "kernel.name" attribute is "windows". Tests can use the result to skip
// OS-specific cases when no Windows clients are available.
//
// An error is returned only when querying the Nomad API fails.
func ListWindowsClientNodes(client *api.Client) ([]string, error) {
	const osName = "windows"
	return listClientNodesByOS(client, osName)
}

// ListLinuxClientNodes reports the IDs of the client nodes whose
// "kernel.name" attribute is "linux". Tests can use the result to skip
// OS-specific cases when no Linux clients are available.
//
// An error is returned only when querying the Nomad API fails.
func ListLinuxClientNodes(client *api.Client) ([]string, error) {
	const osName = "linux"
	return listClientNodesByOS(client, osName)
}

// listClientNodesByOS returns the IDs of all nodes whose "kernel.name"
// attribute equals osName.
//
// It lists every node known to the cluster and then fetches each node
// individually, because the attribute is read from the full node object
// rather than from the list stub.
//
// The returned slice is never nil. On failure it holds the IDs matched
// before the error occurred, and the error wraps the underlying API
// error so callers can inspect it with errors.Is or errors.As.
func listClientNodesByOS(client *api.Client, osName string) ([]string, error) {
	nodeIDs := []string{}
	nodes, _, err := client.Nodes().List(&api.QueryOptions{})
	if err != nil {
		return nodeIDs, fmt.Errorf("could not query nodes: %w", err)
	}
	for _, stubNode := range nodes {
		node, _, err := client.Nodes().Info(stubNode.ID, nil)
		if err != nil {
			// Name the node so the failure can be told apart from a
			// failure of the list query above.
			return nodeIDs, fmt.Errorf("could not query node %q: %w", stubNode.ID, err)
		}
		if name, ok := node.Attributes["kernel.name"]; ok && name == osName {
			nodeIDs = append(nodeIDs, stubNode.ID)
		}
	}
	return nodeIDs, nil
}

// NodeStatusList runs `nomad node status -verbose` through Command and
// hands the output to ParseColumns, returning the parsed entries as a
// slice of maps (other helpers in this file read the "ID" and "Status"
// keys from them).
//
// A failure of either the command or the parsing step is returned as a
// wrapped error alongside a nil slice.
func NodeStatusList() ([]map[string]string, error) {
	raw, cmdErr := Command("nomad", "node", "status", "-verbose")
	if cmdErr != nil {
		return nil, fmt.Errorf("'nomad node status' failed: %w", cmdErr)
	}

	parsed, parseErr := ParseColumns(raw)
	if parseErr != nil {
		return nil, fmt.Errorf("could not parse node status output: %w", parseErr)
	}

	return parsed, nil
}

func NodeStatusListFiltered(filterFn func(string) bool) ([]map[string]string, error) {

	out, err := Command("nomad", "node", "status", "-verbose")
	if err != nil {
		return nil, fmt.Errorf("'nomad node status' failed: %w", err)
	}

	allNodes, err := ParseColumns(out)
	if err != nil {
		return nil, fmt.Errorf("could not parse node status output: %w", err)
	}
	nodes := []map[string]string{}

	for _, node := range allNodes {
		out, err := Command("nomad", "node", "status", "-verbose", node["ID"])
		if err != nil {
			return nil, fmt.Errorf("could not node status output: %w", err)
		}
		if filterFn(out) {
			nodes = append(nodes, node)
		}
	}

	return nodes, nil
}

// WaitForNodeStatus polls NodeStatusList until the node with the given
// ID reports the expected status, or the retry budget is exhausted.
//
// The polling interval and number of retries come from wc.OrDefault().
// Each attempt sleeps for the interval before querying the node list.
//
// It returns nil once the status matches. On failure it returns an
// error: if the failure callback receives a non-nil error (for example
// because listing the nodes failed) that error is wrapped; otherwise the
// error reports the last status observed for the node, which is the
// empty string if the node was never seen in the list.
func WaitForNodeStatus(nodeID, status string, wc *WaitConfig) error {
	var got string
	var err error
	interval, retries := wc.OrDefault()
	testutil.WaitForResultRetries(retries, func() (bool, error) {
		time.Sleep(interval)

		// Use a distinct name so the outer err, which carries the final
		// result, is not shadowed inside the closure.
		nodeStatuses, listErr := NodeStatusList()
		if listErr != nil {
			return false, listErr
		}
		for _, nodeStatus := range nodeStatuses {
			if nodeStatus["ID"] != nodeID {
				continue
			}
			got = nodeStatus["Status"]
			if got == status {
				return true, nil
			}
		}
		return false, nil
	}, func(e error) {
		if e != nil {
			// Surface the underlying failure instead of reporting a
			// misleading status mismatch.
			err = fmt.Errorf("node status check failed: %w", e)
			return
		}
		err = fmt.Errorf("node status check failed: got %#v", got)
	})
	return err
}
e2e: test infra for client node restarts (#6313) Add a test helper that restarts a specific client node running under systemd using a `raw_exec` job. 2019-09-18 14:10:14 +00:00			`package e2eutil`

			`import (`
			`"fmt"`
E2E disconnected clients test refactor (#12402) * Wait longer for node to go down in disconnected clients test. The existing helper only waits 10s, but there's a jitter on heartbeats that we need to account for. Wait for 30s for node to go down to give us plenty of room * Port disconnected clients to stdlib-style test 2022-03-30 13:12:44 +00:00			`"os"`
			`"path/filepath"`
e2e: test infra for client node restarts (#6313) Add a test helper that restarts a specific client node running under systemd using a `raw_exec` job. 2019-09-18 14:10:14 +00:00			`"time"`

			`"github.com/hashicorp/nomad/api"`
e2e: test for allocations replacement on disconnected clients (#12375) This test exercises the behavior of clients that become disconnected and have their allocations replaced. Future test cases will exercise the `max_client_disconnect` field on the job spec. 2022-03-25 16:26:43 +00:00			`"github.com/hashicorp/nomad/testutil"`
e2e: test infra for client node restarts (#6313) Add a test helper that restarts a specific client node running under systemd using a `raw_exec` job. 2019-09-18 14:10:14 +00:00			`)`

E2E: test for nodes disconnected by netsplit (#12407) 2022-04-11 15:34:27 +00:00			`// AgentDisconnect is a test helper function that runs a raw_exec job`
			`// that will disconnect a client at the network level and reconnect it`
			`// after the specified period of time.`
			`//`
			`// Returns once the job is registered with the job ID of the restart`
			`// job and any registration errors, not after the duration, so that`
			`// callers can take actions while the client is down.`
			`func AgentDisconnect(nodeID string, after time.Duration) (string, error) {`
			`jobID := "disconnect-" + nodeID`
			`vars := []string{"-var", "nodeID=" + nodeID}`
			`if after > 0 {`
			`vars = append(vars, "-var", fmt.Sprintf("time=%d", int(after.Seconds())))`
			`}`

			`jobFilePath := "../e2eutil/input/disconnect-node.nomad"`

			`// TODO: temporary hack around having older tests running on the`
			`// framework vs new tests not, as the framework has a different`
			`// working directory`
			`dir, err := os.Getwd()`
			`if err != nil {`
			`return "", err`
			`}`
			`if filepath.Base(dir) == "e2e" {`
			`jobFilePath = "e2eutil/input/disconnect-node.nomad"`
			`}`

			`err = RegisterWithArgs(jobID, jobFilePath, vars...)`
			`return jobID, err`
			`}`

e2e: test for allocations replacement on disconnected clients (#12375) This test exercises the behavior of clients that become disconnected and have their allocations replaced. Future test cases will exercise the `max_client_disconnect` field on the job spec. 2022-03-25 16:26:43 +00:00			`// AgentRestartAfter is a test helper function that runs a raw_exec`
			`// job that will stop a client and restart it after the specified`
			`// period of time. The node must be running under systemd.`
			`//`
			`// Returns once the job is registered with the job ID of the restart`
			`// job and any registration errors, not after the duration, so that`
			`// callers can take actions while the client is down.`
			`func AgentRestartAfter(nodeID string, after time.Duration) (string, error) {`
			`jobID := "restart-" + nodeID`
			`vars := []string{"-var", "nodeID=" + nodeID}`
			`if after > 0 {`
			`vars = append(vars, "-var", fmt.Sprintf("time=%d", int(after.Seconds())))`
			`}`

E2E disconnected clients test refactor (#12402) * Wait longer for node to go down in disconnected clients test. The existing helper only waits 10s, but there's a jitter on heartbeats that we need to account for. Wait for 30s for node to go down to give us plenty of room * Port disconnected clients to stdlib-style test 2022-03-30 13:12:44 +00:00			`jobFilePath := "../e2eutil/input/restart-node.nomad"`

			`// TODO: temporary hack around having older tests running on the`
			`// framework vs new tests not, as the framework has a different`
			`// working directory`
			`dir, err := os.Getwd()`
			`if err != nil {`
			`return "", err`
			`}`
			`if filepath.Base(dir) == "e2e" {`
			`jobFilePath = "e2eutil/input/restart-node.nomad"`
			`}`

			`err = RegisterWithArgs(jobID, jobFilePath, vars...)`
e2e: test for allocations replacement on disconnected clients (#12375) This test exercises the behavior of clients that become disconnected and have their allocations replaced. Future test cases will exercise the `max_client_disconnect` field on the job spec. 2022-03-25 16:26:43 +00:00			`return jobID, err`
			`}`

e2e: test infra for client node restarts (#6313) Add a test helper that restarts a specific client node running under systemd using a `raw_exec` job. 2019-09-18 14:10:14 +00:00			`// AgentRestart is a test helper function that restarts a client node`
			`// running under systemd using a raw_exec job. Returns the job ID of`
			`// the restart job so that callers can clean it up.`
			`func AgentRestart(client *api.Client, nodeID string) (string, error) {`

e2e: test for allocations replacement on disconnected clients (#12375) This test exercises the behavior of clients that become disconnected and have their allocations replaced. Future test cases will exercise the `max_client_disconnect` field on the job spec. 2022-03-25 16:26:43 +00:00			`jobID, err := AgentRestartAfter(nodeID, 0)`
e2e: test infra for client node restarts (#6313) Add a test helper that restarts a specific client node running under systemd using a `raw_exec` job. 2019-09-18 14:10:14 +00:00			`if err != nil {`
			`return jobID, err`
			`}`

			`reasonErr := fmt.Errorf("timed out")`
			`retries := 30`
			`for retries > 0 {`
			`time.Sleep(1 * time.Second)`
			`retries--`

			`allocStubs, _, err := client.Jobs().Allocations(jobID, true, nil)`
			`if err != nil {`
			`reasonErr = err`
			`continue`
			`}`

			`if len(allocStubs) > 0 {`
			`INNER:`
			`for _, state := range allocStubs[0].TaskStates {`
			`if state.State == "dead" {`
			`node, _, err := client.Nodes().Info(nodeID, nil)`
			`if err != nil {`
			`reasonErr = err`
			`break INNER`
			`}`
			`if node != nil && node.Status == "ready" {`
			`return jobID, nil`
			`}`
			`reasonErr = fmt.Errorf("node status not ready")`
			`}`
			`}`
			`}`
			`}`
			`return jobID, fmt.Errorf("node did not become ready: %v", reasonErr)`
			`}`

e2e: add allocstats test for Windows (#6775) Extends the BasicAllocStats test to include a test for Windows clients, exercising stats via a powershell `raw_exec` job. Adds `ListLinuxClientNodes` and `ListWindowsClientNodes` utils so that we can scope tests to run only when Linux or Windows clients are available. This prevents waiting on timeouts when running a subset of the tests against a development cluster (vs our nightly test cluster). 2019-11-26 13:05:42 +00:00			`// ListWindowsClientNodes returns a list of Windows client IDs, so that tests`
			`// can skip operating-specific tests if there are no Windows clients available.`
			`// Returns an error only on client errors.`
			`func ListWindowsClientNodes(client *api.Client) ([]string, error) {`
			`return listClientNodesByOS(client, "windows")`
			`}`

			`// ListLinuxClientNodes returns a list of Linux client IDs, so that tests`
			`// can skip operating-specific tests if there are no Linux clients available`
			`// Returns an error only on client errors.`
			`func ListLinuxClientNodes(client *api.Client) ([]string, error) {`
			`return listClientNodesByOS(client, "linux")`
			`}`

			`func listClientNodesByOS(client *api.Client, osName string) ([]string, error) {`
			`nodeIDs := []string{}`
			`nodes, _, err := client.Nodes().List(&api.QueryOptions{})`
			`if err != nil {`
			`return nodeIDs, fmt.Errorf("could not query nodes: %v", err)`
			`}`
			`for _, stubNode := range nodes {`
			`node, _, err := client.Nodes().Info(stubNode.ID, nil)`
			`if err != nil {`
			`return nodeIDs, fmt.Errorf("could not query nodes: %v", err)`
			`}`
			`if name, ok := node.Attributes["kernel.name"]; ok && name == osName {`
			`nodeIDs = append(nodeIDs, stubNode.ID)`
			`}`
			`}`
			`return nodeIDs, nil`
			`}`
e2e: refactor CLI utils out of rescheduling test (#8905) The CLI helpers in the rescheduling test were intended for shared use, but until some other tests were written we didn't want to waste time making them generic. This changeset refactors them and adds some new helpers associated with the node drain tests (under separate PR). 2020-09-16 20:10:06 +00:00
			`func NodeStatusList() ([]map[string]string, error) {`

			`out, err := Command("nomad", "node", "status", "-verbose")`
			`if err != nil {`
			`return nil, fmt.Errorf("'nomad node status' failed: %w", err)`
			`}`

			`nodes, err := ParseColumns(out)`
			`if err != nil {`
			`return nil, fmt.Errorf("could not parse node status output: %w", err)`
			`}`
			`return nodes, nil`
			`}`

			`func NodeStatusListFiltered(filterFn func(string) bool) ([]map[string]string, error) {`

			`out, err := Command("nomad", "node", "status", "-verbose")`
			`if err != nil {`
			`return nil, fmt.Errorf("'nomad node status' failed: %w", err)`
			`}`

			`allNodes, err := ParseColumns(out)`
			`if err != nil {`
			`return nil, fmt.Errorf("could not parse node status output: %w", err)`
			`}`
			`nodes := []map[string]string{}`

			`for _, node := range allNodes {`
			`out, err := Command("nomad", "node", "status", "-verbose", node["ID"])`
			`if err != nil {`
			`return nil, fmt.Errorf("could not node status output: %w", err)`
			`}`
			`if filterFn(out) {`
			`nodes = append(nodes, node)`
			`}`
			`}`

			`return nodes, nil`
			`}`
e2e: test for allocations replacement on disconnected clients (#12375) This test exercises the behavior of clients that become disconnected and have their allocations replaced. Future test cases will exercise the `max_client_disconnect` field on the job spec. 2022-03-25 16:26:43 +00:00
			`func WaitForNodeStatus(nodeID, status string, wc *WaitConfig) error {`
			`var got string`
			`var err error`
			`interval, retries := wc.OrDefault()`
			`testutil.WaitForResultRetries(retries, func() (bool, error) {`
			`time.Sleep(interval)`

			`nodeStatuses, err := NodeStatusList()`
			`if err != nil {`
			`return false, err`
			`}`
			`for _, nodeStatus := range nodeStatuses {`
			`if nodeStatus["ID"] == nodeID {`
			`got = nodeStatus["Status"]`
			`if got == status {`
			`return true, nil`
			`}`
			`}`
			`}`
			`return false, nil`
			`}, func(e error) {`
			`err = fmt.Errorf("node status check failed: got %#v", got)`
			`})`
			`return err`
			`}`