Improve DeadlineTime helper

This commit is contained in:
Alex Dadgar 2018-02-28 16:25:56 -08:00 committed by Michael Schurter
parent b7c993f0e5
commit a37329189a
7 changed files with 109 additions and 54 deletions

View File

@ -176,6 +176,9 @@ type Node struct {
type DrainStrategy struct { type DrainStrategy struct {
// DrainSpec is the user declared drain specification // DrainSpec is the user declared drain specification
DrainSpec DrainSpec
// DeadlineTime is the deadline time for the drain.
DeadlineTime time.Time
} }
// DrainSpec describes a Node's drain behavior. // DrainSpec describes a Node's drain behavior.

View File

@ -21,6 +21,10 @@ func TestClient(t testing.T, cb func(c *config.Config)) *Client {
}, },
} }
// Loosen GC threshold
conf.GCDiskUsageThreshold = 98.0
conf.GCInodeUsageThreshold = 98.0
// Tighten the fingerprinter timeouts // Tighten the fingerprinter timeouts
if conf.Options == nil { if conf.Options == nil {
conf.Options = make(map[string]string) conf.Options = make(map[string]string)

View File

@ -157,24 +157,41 @@ func (n *NodeDrainer) Run() {
} }
} }
// getNextDeadline scans the given set of draining nodes and returns the
// earliest finite drain deadline among them. The boolean result reports
// whether any node contributed a deadline; when it is false the returned
// time is meaningless and must be ignored by the caller.
func getNextDeadline(nodes map[string]*structs.Node) (time.Time, bool) {
	var (
		earliest time.Time
		have     bool
	)
	for _, n := range nodes {
		infinite, deadline := n.DrainStrategy.DeadlineTime()
		if infinite {
			// Infinite drains never force a wakeup.
			continue
		}
		// NOTE: the IsZero check (not `!have`) is deliberate — a zero-time
		// "force" deadline is replaced by any later finite deadline.
		if earliest.IsZero() || deadline.Before(earliest) {
			earliest = deadline
			have = true
		}
	}
	return earliest, have
}
// nodeDrainer is the core node draining main loop and should be started in a // nodeDrainer is the core node draining main loop and should be started in a
// goroutine when a server establishes leadership. // goroutine when a server establishes leadership.
func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) { func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) {
nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(n.logger, state) nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(n.logger, state)
// Wait for a node's drain deadline to expire // Wait for a node's drain deadline to expire
var nextDeadline time.Time nextDeadline, ok := getNextDeadline(nodes)
for _, node := range nodes {
if nextDeadline.IsZero() {
nextDeadline = node.DrainStrategy.DeadlineTime()
continue
}
if deadline := node.DrainStrategy.DeadlineTime(); deadline.Before(nextDeadline) {
nextDeadline = deadline
}
}
deadlineTimer := time.NewTimer(time.Until(nextDeadline)) deadlineTimer := time.NewTimer(time.Until(nextDeadline))
stopDeadlineTimer := func() {
if !deadlineTimer.Stop() {
select {
case <-deadlineTimer.C:
default:
}
}
}
if !ok {
stopDeadlineTimer()
}
// Watch for nodes to start or stop draining // Watch for nodes to start or stop draining
nodeWatcher := newNodeWatcher(n.logger, nodes, nodesIndex, state) nodeWatcher := newNodeWatcher(n.logger, nodes, nodesIndex, state)
@ -197,33 +214,14 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore)
// update draining nodes // update draining nodes
n.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes)) n.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes))
// update deadline timer d, ok := getNextDeadline(nodes)
changed := false if ok && !nextDeadline.Equal(d) {
for _, n := range nodes { nextDeadline = d
if nextDeadline.IsZero() {
nextDeadline = n.DrainStrategy.DeadlineTime()
changed = true
continue
}
if deadline := n.DrainStrategy.DeadlineTime(); deadline.Before(nextDeadline) {
nextDeadline = deadline
changed = true
}
}
// if changed reset the timer
if changed {
n.logger.Printf("[TRACE] nomad.drain: new node deadline: %s", nextDeadline) n.logger.Printf("[TRACE] nomad.drain: new node deadline: %s", nextDeadline)
if !deadlineTimer.Stop() { stopDeadlineTimer()
// timer may have been recv'd in a
// previous loop, so don't block
select {
case <-deadlineTimer.C:
default:
}
}
deadlineTimer.Reset(time.Until(nextDeadline)) deadlineTimer.Reset(time.Until(nextDeadline))
} else if !ok {
stopDeadlineTimer()
} }
case jobs := <-jobWatcher.WaitCh(): case jobs := <-jobWatcher.WaitCh():
@ -275,7 +273,8 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore)
// track number of allocs left on this node to be drained // track number of allocs left on this node to be drained
allocsLeft := false allocsLeft := false
deadlineReached := node.DrainStrategy.DeadlineTime().Before(now) inf, deadline := node.DrainStrategy.DeadlineTime()
deadlineReached := !inf && deadline.Before(now)
for _, alloc := range allocs { for _, alloc := range allocs {
jobkey := jobKey{alloc.Namespace, alloc.JobID} jobkey := jobKey{alloc.Namespace, alloc.JobID}
@ -307,8 +306,13 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore)
// Don't bother collecting system/batch jobs for nodes that haven't hit their deadline // Don't bother collecting system/batch jobs for nodes that haven't hit their deadline
if job.Type != structs.JobTypeService && !deadlineReached { if job.Type != structs.JobTypeService && !deadlineReached {
n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s", if inf, d := node.DrainStrategy.DeadlineTime(); inf {
job.Type, job.Name, node.DrainStrategy.DeadlineTime().Sub(now)) n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because node has an infinite deadline",
job.Type, job.Name)
} else {
n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s",
job.Type, job.Name, d.Sub(now))
}
skipJob[jobkey] = struct{}{} skipJob[jobkey] = struct{}{}
continue continue
} }
@ -370,7 +374,7 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore)
tgKey := makeTaskGroupKey(alloc) tgKey := makeTaskGroupKey(alloc)
if node.DrainStrategy.DeadlineTime().Before(now) { if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) {
n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6]) n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6])
// Alloc's Node has reached its deadline // Alloc's Node has reached its deadline
stoplist.add(drainingJob.job, alloc) stoplist.add(drainingJob.job, alloc)
@ -494,7 +498,7 @@ func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*struc
nodes[node.ID] = node nodes[node.ID] = node
// No point in tracking draining allocs as the deadline has been reached // No point in tracking draining allocs as the deadline has been reached
if node.DrainStrategy.DeadlineTime().Before(now) { if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) {
continue continue
} }

View File

@ -59,6 +59,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) {
serviceJob := mock.Job() serviceJob := mock.Job()
serviceJob.Name = "service-job" serviceJob.Name = "service-job"
serviceJob.Type = structs.JobTypeService serviceJob.Type = structs.JobTypeService
serviceJob.Constraints = nil
serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{ serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{
MaxParallel: 1, MaxParallel: 1,
HealthCheck: structs.MigrateStrategyHealthStates, HealthCheck: structs.MigrateStrategyHealthStates,
@ -76,6 +77,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) {
systemJob := mock.SystemJob() systemJob := mock.SystemJob()
systemJob.Name = "system-job" systemJob.Name = "system-job"
systemJob.Type = structs.JobTypeSystem systemJob.Type = structs.JobTypeSystem
systemJob.Constraints = nil
//FIXME hack until system job reschedule policy validation is fixed //FIXME hack until system job reschedule policy validation is fixed
systemJob.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: time.Minute} systemJob.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: time.Minute}
systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver" systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver"
@ -90,6 +92,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) {
batchJob := mock.Job() batchJob := mock.Job()
batchJob.Name = "batch-job" batchJob.Name = "batch-job"
batchJob.Type = structs.JobTypeBatch batchJob.Type = structs.JobTypeBatch
batchJob.Constraints = nil
batchJob.TaskGroups[0].Name = "batch-group" batchJob.TaskGroups[0].Name = "batch-group"
batchJob.TaskGroups[0].Migrate = nil batchJob.TaskGroups[0].Migrate = nil
batchJob.TaskGroups[0].Tasks[0].Name = "batch-task" batchJob.TaskGroups[0].Tasks[0].Name = "batch-task"
@ -159,6 +162,11 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) {
t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus) t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus)
} }
} }
if resp, err := rpc.EvalList(); err == nil {
for _, eval := range resp.Evaluations {
t.Logf("% #v\n", pretty.Formatter(eval))
}
}
t.Fatalf("failed waiting for all allocs to start: %v", err) t.Fatalf("failed waiting for all allocs to start: %v", err)
}) })

View File

@ -57,18 +57,18 @@ func (n *nodeWatcher) run(ctx context.Context) {
for _, newNode := range newNodes { for _, newNode := range newNodes {
if existingNode, ok := n.nodes[newNode.ID]; ok { if existingNode, ok := n.nodes[newNode.ID]; ok {
// Node was draining, see if it has changed // Node was draining, see if it has changed
if !newNode.Drain { if newNode.DrainStrategy == nil {
// Node stopped draining // Node stopped draining
delete(n.nodes, newNode.ID) delete(n.nodes, newNode.ID)
changed = true changed = true
} else if !newNode.DrainStrategy.DeadlineTime().Equal(existingNode.DrainStrategy.DeadlineTime()) { } else if !newNode.DrainStrategy.Equal(existingNode.DrainStrategy) {
// Update deadline // Update deadline
n.nodes[newNode.ID] = newNode n.nodes[newNode.ID] = newNode
changed = true changed = true
} }
} else { } else {
// Node was not draining // Node was not draining
if newNode.Drain { if newNode.DrainStrategy != nil {
// Node started draining // Node started draining
n.nodes[newNode.ID] = newNode n.nodes[newNode.ID] = newNode
changed = true changed = true

View File

@ -443,6 +443,11 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest,
} }
} }
// Mark the deadline time
if args.DrainStrategy != nil && args.DrainStrategy.Deadline.Nanoseconds() > 0 {
args.DrainStrategy.ForceDeadline = time.Now().Add(args.DrainStrategy.Deadline)
}
// Commit this update via Raft // Commit this update via Raft
_, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args)
if err != nil { if err != nil {

View File

@ -1214,9 +1214,9 @@ type DrainStrategy struct {
// DrainSpec is the user declared drain specification // DrainSpec is the user declared drain specification
DrainSpec DrainSpec
// StartTime as nanoseconds since Unix epoch indicating when a drain // ForceDeadline is the deadline time for the drain after which drains will
// began for deadline calculations. // be forced
StartTime int64 ForceDeadline time.Time
} }
func (d *DrainStrategy) Copy() *DrainStrategy { func (d *DrainStrategy) Copy() *DrainStrategy {
@ -1229,16 +1229,47 @@ func (d *DrainStrategy) Copy() *DrainStrategy {
return nd return nd
} }
// DeadlineTime returns the Time this drain's deadline will be reached or the // DeadlineTime returns a boolean whether the drain strategy allows an infinite
// zero value for Time if DrainStrategy is nil or Duration is <= 0. // duration or otherwise the deadline time. The force drain is captured by the
func (d *DrainStrategy) DeadlineTime() time.Time { // deadline time being in the past.
func (d *DrainStrategy) DeadlineTime() (infinite bool, deadline time.Time) {
// Treat the nil case as a force drain so during an upgrade where a node may
// not have a drain strategy but has Drain set to true, it is treated as a
// force to mimic old behavior.
if d == nil { if d == nil {
return time.Time{} return false, time.Time{}
} }
if d.Deadline <= 0 {
return time.Time{} ns := d.Deadline.Nanoseconds()
switch {
case ns < 0: // Force
return false, time.Time{}
case ns == 0: // Infinite
return true, time.Time{}
default:
return false, d.ForceDeadline
} }
return time.Unix(0, d.StartTime).Add(d.Deadline) }
// Equal reports whether two DrainStrategy values are equivalent, comparing
// the ForceDeadline, Deadline, and IgnoreSystemJobs fields. Two nil
// receivers/arguments are considered equal; a nil compared with a non-nil
// is not.
func (d *DrainStrategy) Equal(o *DrainStrategy) bool {
if d == nil && o == nil {
return true
} else if o != nil && d == nil {
return false
} else if d != nil && o == nil {
return false
}
// Compare values
// NOTE(review): ForceDeadline is compared with != rather than
// time.Time.Equal — presumably both values originate from the same wall
// clock so this is safe, but confirm monotonic-clock/location differences
// cannot cause false inequality here.
if d.ForceDeadline != o.ForceDeadline {
return false
} else if d.Deadline != o.Deadline {
return false
} else if d.IgnoreSystemJobs != o.IgnoreSystemJobs {
return false
}
return true
} }
// Node is a representation of a schedulable client node // Node is a representation of a schedulable client node