open-nomad/nomad/drainer/watch_nodes_test.go
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package drainer

import (
	"fmt"
	"testing"
	"time"

	"github.com/shoenig/test/must"
	"github.com/shoenig/test/wait"

	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)
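
// The tests below exercise the node drain watcher against a real
// state.StateStore. testNodeDrainWatcher is defined elsewhere in this
// package; it is assumed here to return the watcher, the backing store, and
// a *NodeDrainer whose job watcher and deadline notifier are the
// MockJobWatcher and MockDeadlineNotifier inspected by the assertions below.
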
// TestNodeDrainWatcher_AddNodes tests that new nodes are added to the node
// watcher and deadline notifier, but only if they have a drain spec.
func TestNodeDrainWatcher_AddNodes(t *testing.T) {
ci.Parallel(t)
_, store, tracker := testNodeDrainWatcher(t)
// Create two nodes, one draining and one not draining
n1, n2 := mock.Node(), mock.Node()
n2.DrainStrategy = &structs.DrainStrategy{
DrainSpec: structs.DrainSpec{
Deadline: time.Hour,
},
ForceDeadline: time.Now().Add(time.Hour),
}
// Create a job with a running alloc on each node
job := mock.Job()
jobID := structs.NamespacedID{Namespace: job.Namespace, ID: job.ID}
must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, 101, nil, job))
alloc1 := mock.Alloc()
alloc1.JobID = job.ID
alloc1.Job = job
alloc1.TaskGroup = job.TaskGroups[0].Name
alloc1.NodeID = n1.ID
alloc1.DeploymentStatus = &structs.AllocDeploymentStatus{Healthy: pointer.Of(true)}
alloc2 := alloc1.Copy()
alloc2.ID = uuid.Generate()
alloc2.NodeID = n2.ID
must.NoError(t, store.UpsertAllocs(
structs.MsgTypeTestSetup, 102, []*structs.Allocation{alloc1, alloc2}))
must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, 103, n1))
must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, 104, n2))
// Only 1 node is draining, and the other should not be tracked
assertTrackerSettled(t, tracker, []string{n2.ID})
// Notifications should fire to the job watcher and deadline notifier
must.MapContainsKey(t, tracker.jobWatcher.(*MockJobWatcher).jobs, jobID)
must.MapContainsKey(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes, n2.ID)
}

// TestNodeDrainWatcher_Remove tests that when a node should no longer be
// tracked, we stop tracking it in the node watcher and deadline notifier.
func TestNodeDrainWatcher_Remove(t *testing.T) {
ci.Parallel(t)
_, store, tracker := testNodeDrainWatcher(t)
t.Run("stop drain", func(t *testing.T) {
n, _ := testNodeDrainWatcherSetup(t, store, tracker)
index, _ := store.LatestIndex()
must.NoError(t, store.UpdateNodeDrain(
structs.MsgTypeTestSetup, index+1, n.ID, nil, false, 0, nil, nil, ""))
// Node with stopped drain should no longer be tracked
assertTrackerSettled(t, tracker, []string{})
must.MapEmpty(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes)
})

t.Run("delete node", func(t *testing.T) {
n, _ := testNodeDrainWatcherSetup(t, store, tracker)
index, _ := store.LatestIndex()
index++
must.NoError(t, store.DeleteNode(structs.MsgTypeTestSetup, index, []string{n.ID}))
// Deleted node should no longer be tracked
assertTrackerSettled(t, tracker, []string{})
must.MapEmpty(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes)
})
}

// TestNodeDrainWatcher_NoRemove tests that when the node status changes to
// down/disconnected we don't remove it from the node watcher or deadline
// notifier.
func TestNodeDrainWatcher_NoRemove(t *testing.T) {
ci.Parallel(t)
_, store, tracker := testNodeDrainWatcher(t)
n, _ := testNodeDrainWatcherSetup(t, store, tracker)
index, _ := store.LatestIndex()
n = n.Copy()
n.Status = structs.NodeStatusDisconnected
must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, index+1, n))
assertTrackerSettled(t, tracker, []string{n.ID})
must.MapContainsKey(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes, n.ID)
index, _ = store.LatestIndex()
n = n.Copy()
n.Status = structs.NodeStatusDown
must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, index+1, n))
assertTrackerSettled(t, tracker, []string{n.ID})
must.MapContainsKey(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes, n.ID)
}

// TestNodeDrainWatcher_Update_Spec tests that drain spec updates emit events
// to the node watcher and deadline notifier.
func TestNodeDrainWatcher_Update_Spec(t *testing.T) {
ci.Parallel(t)
_, store, tracker := testNodeDrainWatcher(t)
n, _ := testNodeDrainWatcherSetup(t, store, tracker)
// Update the spec to extend the deadline
strategy := n.DrainStrategy.Copy()
strategy.DrainSpec.Deadline += time.Hour
index, _ := store.LatestIndex()
must.NoError(t, store.UpdateNodeDrain(
structs.MsgTypeTestSetup, index+1, n.ID, strategy, false, time.Now().Unix(),
&structs.NodeEvent{}, map[string]string{}, "",
))
// We should see a new event
assertTrackerSettled(t, tracker, []string{n.ID})
// Update the spec to have an infinite deadline
strategy = strategy.Copy()
strategy.DrainSpec.Deadline = 0
index, _ = store.LatestIndex()
must.NoError(t, store.UpdateNodeDrain(
structs.MsgTypeTestSetup, index+1, n.ID, strategy, false, time.Now().Unix(),
&structs.NodeEvent{}, map[string]string{}, "",
))
// We should see a new event and the node should still be tracked but no
// longer in the deadline notifier
assertTrackerSettled(t, tracker, []string{n.ID})
must.MapEmpty(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes)
}

// TestNodeDrainWatcher_Update_IsDone tests that draining a node with no
// allocations completes immediately: the node is never tracked and its drain
// strategy is cleared.
func TestNodeDrainWatcher_Update_IsDone(t *testing.T) {
ci.Parallel(t)
_, store, tracker := testNodeDrainWatcher(t)
// Create a draining node
n := mock.Node()
strategy := &structs.DrainStrategy{
DrainSpec: structs.DrainSpec{Deadline: time.Hour},
ForceDeadline: time.Now().Add(time.Hour),
}
n.DrainStrategy = strategy
must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, 100, n))
// There are no jobs on this node so the drain should immediately
// complete. We should no longer be tracking the node and its drain strategy
// should be cleared
assertTrackerSettled(t, tracker, []string{})
must.MapEmpty(t, tracker.jobWatcher.(*MockJobWatcher).jobs)
must.MapEmpty(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes)
n, _ = store.NodeByID(nil, n.ID)
must.Nil(t, n.DrainStrategy)
}

// TestNodeDrainWatcher_Update_DrainComplete tests that allocation updates
// that complete the drain emit events to the node watcher and deadline
// notifier.
func TestNodeDrainWatcher_Update_DrainComplete(t *testing.T) {
ci.Parallel(t)
_, store, tracker := testNodeDrainWatcher(t)
n, _ := testNodeDrainWatcherSetup(t, store, tracker)
// Simulate event: an alloc is terminal so DrainingJobWatcher.Migrated
// channel updates NodeDrainer, which updates Raft
_, err := tracker.raft.NodesDrainComplete([]string{n.ID},
structs.NewNodeEvent().
SetSubsystem(structs.NodeEventSubsystemDrain).
SetMessage(NodeDrainEventComplete))
must.NoError(t, err)
assertTrackerSettled(t, tracker, []string{})
n, _ = store.NodeByID(nil, n.ID)
must.Nil(t, n.DrainStrategy)
must.MapEmpty(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes)
}

func testNodeDrainWatcherSetup(
t *testing.T, store *state.StateStore, tracker *NodeDrainer) (
*structs.Node, structs.NamespacedID) {
t.Helper()
index, _ := store.LatestIndex()
// Create a job that will have an alloc on our node
job := mock.Job()
jobID := structs.NamespacedID{Namespace: job.Namespace, ID: job.ID}
index++
must.NoError(t, store.UpsertJob(structs.MsgTypeTestSetup, index, nil, job))
// Create a draining node with an alloc for the job running on it
node := mock.Node()
node.DrainStrategy = &structs.DrainStrategy{
DrainSpec: structs.DrainSpec{Deadline: time.Hour},
ForceDeadline: time.Now().Add(time.Hour),
}
alloc := mock.Alloc()
alloc.JobID = job.ID
alloc.Job = job
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.NodeID = node.ID
alloc.DeploymentStatus = &structs.AllocDeploymentStatus{Healthy: pointer.Of(true)}
index++
must.NoError(t, store.UpsertAllocs(
structs.MsgTypeTestSetup, index, []*structs.Allocation{alloc}))
index++
must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, index, node))
// Node should be tracked and notifications should fire to the job watcher
// and deadline notifier
assertTrackerSettled(t, tracker, []string{node.ID})
must.MapContainsKey(t, tracker.jobWatcher.(*MockJobWatcher).jobs, jobID)
must.MapContainsKeys(t,
tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes, []string{node.ID})
return node, jobID
}

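// assertNodeUntracked is a hypothetical helper, not used by the tests above:
// a minimal sketch of how the repeated "drain is finished" assertions could
// be collapsed, using only identifiers already present in this file.
func assertNodeUntracked(t *testing.T, tracker *NodeDrainer) {
	t.Helper()
	// The tracker should settle on an empty set of draining nodes...
	assertTrackerSettled(t, tracker, []string{})
	// ...and the deadline notifier should have dropped all of them.
	must.MapEmpty(t, tracker.deadlineNotifier.(*MockDeadlineNotifier).nodes)
}

// assertTrackerSettled waits for the tracker to converge on exactly the
// given set of draining nodes: first until the set is reached, then for a
// further window to make sure it stays reached.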
func assertTrackerSettled(t *testing.T, tracker *NodeDrainer, nodeIDs []string) {
t.Helper()
must.Wait(t, wait.InitialSuccess(
wait.Timeout(100*time.Millisecond),
wait.Gap(time.Millisecond),
wait.TestFunc(func() (bool, error) {
if len(tracker.TrackedNodes()) != len(nodeIDs) {
return false, fmt.Errorf(
"expected nodes %v to become marked draining, got %d",
nodeIDs, len(tracker.TrackedNodes()))
}
return true, nil
}),
))
must.Wait(t, wait.ContinualSuccess(
wait.Timeout(100*time.Millisecond),
wait.Gap(10*time.Millisecond),
wait.TestFunc(func() (bool, error) {
if len(tracker.TrackedNodes()) != len(nodeIDs) {
return false, fmt.Errorf(
"expected nodes %v to stay marked draining, got %d",
nodeIDs, len(tracker.TrackedNodes()))
}
return true, nil
}),
))
for _, nodeID := range nodeIDs {
must.MapContainsKey(t, tracker.TrackedNodes(), nodeID)
}
}
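
// waitSettled is a hypothetical generalization of the two-phase pattern in
// assertTrackerSettled, not used by the tests above: wait.InitialSuccess
// blocks until the condition first holds, and wait.ContinualSuccess then
// guards against the condition flapping back. The timeouts mirror the
// values used above.
func waitSettled(t *testing.T, cond func() (bool, error)) {
	t.Helper()
	// Phase 1: poll tightly until the condition holds at least once.
	must.Wait(t, wait.InitialSuccess(
		wait.Timeout(100*time.Millisecond),
		wait.Gap(time.Millisecond),
		wait.TestFunc(cond),
	))
	// Phase 2: keep checking for a further window to catch flapping.
	must.Wait(t, wait.ContinualSuccess(
		wait.Timeout(100*time.Millisecond),
		wait.Gap(10*time.Millisecond),
		wait.TestFunc(cond),
	))
}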