Improve DeadlineTime helper

Alex Dadgar 2018-02-28 16:25:56 -08:00 committed by Michael Schurter
parent b7c993f0e5
commit a37329189a
7 changed files with 109 additions and 54 deletions

View File

@@ -176,6 +176,9 @@ type Node struct {
type DrainStrategy struct {
// DrainSpec is the user declared drain specification
DrainSpec
// DeadlineTime is the deadline time for the drain.
DeadlineTime time.Time
}
// DrainSpec describes a Node's drain behavior.
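For orientation, a hedged sketch of how the embedded spec and the new field fit together on the api side. The DrainSpec Deadline field is an assumption here (taken to mirror the structs package) and the values are illustrative:

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// The embedded DrainSpec carries the user-declared, relative deadline;
	// DeadlineTime is the absolute time at which the drain is forced.
	d := &api.DrainStrategy{
		DrainSpec:    api.DrainSpec{Deadline: 30 * time.Minute}, // Deadline field assumed to mirror structs.DrainSpec
		DeadlineTime: time.Now().Add(30 * time.Minute),
	}
	fmt.Println(time.Until(d.DeadlineTime)) // roughly 30m
}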

View File

@@ -21,6 +21,10 @@ func TestClient(t testing.T, cb func(c *config.Config)) *Client {
},
}
// Loosen GC threshold
conf.GCDiskUsageThreshold = 98.0
conf.GCInodeUsageThreshold = 98.0
// Tighten the fingerprinter timeouts
if conf.Options == nil {
conf.Options = make(map[string]string)

View File

@@ -157,24 +157,41 @@ func (n *NodeDrainer) Run() {
}
}
// getNextDeadline is a helper that takes a set of draining nodes and returns the
// next deadline. It also returns a boolean indicating whether a deadline exists.
func getNextDeadline(nodes map[string]*structs.Node) (time.Time, bool) {
var nextDeadline time.Time
found := false
for _, node := range nodes {
inf, d := node.DrainStrategy.DeadlineTime()
if !inf && (nextDeadline.IsZero() || d.Before(nextDeadline)) {
nextDeadline = d
found = true
}
}
return nextDeadline, found
}
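// Aside (illustrative sketch, not part of this change): a test along these
// lines in a _test.go file could exercise the helper. The test name and
// fixture values are hypothetical; only fields introduced in this diff are used.
func TestGetNextDeadline_sketch(t *testing.T) {
	now := time.Now()
	nodes := map[string]*structs.Node{
		"far": {DrainStrategy: &structs.DrainStrategy{
			DrainSpec:     structs.DrainSpec{Deadline: time.Hour},
			ForceDeadline: now.Add(time.Hour),
		}},
		"near": {DrainStrategy: &structs.DrainStrategy{
			DrainSpec:     structs.DrainSpec{Deadline: time.Minute},
			ForceDeadline: now.Add(time.Minute),
		}},
		// A zero deadline is infinite and must be skipped.
		"inf": {DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{Deadline: 0},
		}},
	}

	deadline, ok := getNextDeadline(nodes)
	if !ok || !deadline.Equal(now.Add(time.Minute)) {
		t.Fatalf("expected the near node's deadline, got %v (ok=%v)", deadline, ok)
	}

	// With only infinite deadlines there is nothing to time out on.
	if _, ok := getNextDeadline(map[string]*structs.Node{"inf": nodes["inf"]}); ok {
		t.Fatalf("expected no deadline when all drains are infinite")
	}
}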
// nodeDrainer is the core node draining main loop and should be started in a
// goroutine when a server establishes leadership.
func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore) {
nodes, nodesIndex, drainingJobs, allocsIndex := initDrainer(n.logger, state)
// Wait for a node's drain deadline to expire
var nextDeadline time.Time
for _, node := range nodes {
if nextDeadline.IsZero() {
nextDeadline = node.DrainStrategy.DeadlineTime()
continue
}
if deadline := node.DrainStrategy.DeadlineTime(); deadline.Before(nextDeadline) {
nextDeadline = deadline
}
}
nextDeadline, ok := getNextDeadline(nodes)
deadlineTimer := time.NewTimer(time.Until(nextDeadline))
stopDeadlineTimer := func() {
if !deadlineTimer.Stop() {
select {
case <-deadlineTimer.C:
default:
}
}
}
if !ok {
stopDeadlineTimer()
}
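// Aside (illustrative sketch, not part of this change): stopDeadlineTimer
// drains the channel because Timer.Stop does not do so once the timer has
// fired, and a stale tick would otherwise satisfy the next receive after a
// Reset. A minimal, hypothetical demonstration of the same idiom:
func demoStopThenReset() {
	timer := time.NewTimer(time.Millisecond)
	time.Sleep(5 * time.Millisecond) // timer fires; the tick sits in timer.C

	if !timer.Stop() {
		select {
		case <-timer.C: // drain the stale tick
		default: // nothing buffered; it was already received elsewhere
		}
	}

	timer.Reset(10 * time.Millisecond)
	<-timer.C // the fresh tick from Reset, not the stale one
}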
// Watch for nodes to start or stop draining
nodeWatcher := newNodeWatcher(n.logger, nodes, nodesIndex, state)
@@ -197,33 +214,14 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore)
// update draining nodes
n.logger.Printf("[TRACE] nomad.drain: running due to node change (%d nodes draining)", len(nodes))
// update deadline timer
changed := false
for _, n := range nodes {
if nextDeadline.IsZero() {
nextDeadline = n.DrainStrategy.DeadlineTime()
changed = true
continue
}
if deadline := n.DrainStrategy.DeadlineTime(); deadline.Before(nextDeadline) {
nextDeadline = deadline
changed = true
}
}
// if changed reset the timer
if changed {
d, ok := getNextDeadline(nodes)
if ok && !nextDeadline.Equal(d) {
nextDeadline = d
n.logger.Printf("[TRACE] nomad.drain: new node deadline: %s", nextDeadline)
if !deadlineTimer.Stop() {
// timer may have been recv'd in a
// previous loop, so don't block
select {
case <-deadlineTimer.C:
default:
}
}
stopDeadlineTimer()
deadlineTimer.Reset(time.Until(nextDeadline))
} else if !ok {
stopDeadlineTimer()
}
case jobs := <-jobWatcher.WaitCh():
@@ -275,7 +273,8 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore)
// track number of allocs left on this node to be drained
allocsLeft := false
deadlineReached := node.DrainStrategy.DeadlineTime().Before(now)
inf, deadline := node.DrainStrategy.DeadlineTime()
deadlineReached := !inf && deadline.Before(now)
for _, alloc := range allocs {
jobkey := jobKey{alloc.Namespace, alloc.JobID}
@@ -307,8 +306,13 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore)
// Don't bother collecting system/batch jobs for nodes that haven't hit their deadline
if job.Type != structs.JobTypeService && !deadlineReached {
n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s",
job.Type, job.Name, node.DrainStrategy.DeadlineTime().Sub(now))
if inf, d := node.DrainStrategy.DeadlineTime(); inf {
n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because node has an infinite deadline",
job.Type, job.Name)
} else {
n.logger.Printf("[TRACE] nomad.drain: not draining %s job %s because deadline isn't for %s",
job.Type, job.Name, d.Sub(now))
}
skipJob[jobkey] = struct{}{}
continue
}
@@ -370,7 +374,7 @@ func (n *NodeDrainer) nodeDrainer(ctx context.Context, state *state.StateStore)
tgKey := makeTaskGroupKey(alloc)
if node.DrainStrategy.DeadlineTime().Before(now) {
if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) {
n.logger.Printf("[TRACE] nomad.drain: draining job %s alloc %s from node %s due to node's drain deadline", drainingJob.job.Name, alloc.ID[:6], alloc.NodeID[:6])
// Alloc's Node has reached its deadline
stoplist.add(drainingJob.job, alloc)
@@ -494,7 +498,7 @@ func initDrainer(logger *log.Logger, state *state.StateStore) (map[string]*struc
nodes[node.ID] = node
// No point in tracking draining allocs as the deadline has been reached
if node.DrainStrategy.DeadlineTime().Before(now) {
if inf, d := node.DrainStrategy.DeadlineTime(); !inf && d.Before(now) {
continue
}

View File

@@ -59,6 +59,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) {
serviceJob := mock.Job()
serviceJob.Name = "service-job"
serviceJob.Type = structs.JobTypeService
serviceJob.Constraints = nil
serviceJob.TaskGroups[0].Migrate = &structs.MigrateStrategy{
MaxParallel: 1,
HealthCheck: structs.MigrateStrategyHealthStates,
@@ -76,6 +77,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) {
systemJob := mock.SystemJob()
systemJob.Name = "system-job"
systemJob.Type = structs.JobTypeSystem
systemJob.Constraints = nil
//FIXME hack until system job reschedule policy validation is fixed
systemJob.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: time.Minute}
systemJob.TaskGroups[0].Tasks[0].Driver = "mock_driver"
@@ -90,6 +92,7 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) {
batchJob := mock.Job()
batchJob.Name = "batch-job"
batchJob.Type = structs.JobTypeBatch
batchJob.Constraints = nil
batchJob.TaskGroups[0].Name = "batch-group"
batchJob.TaskGroups[0].Migrate = nil
batchJob.TaskGroups[0].Tasks[0].Name = "batch-task"
@@ -159,6 +162,11 @@ func TestNodeDrainer_SimpleDrain(t *testing.T) {
t.Logf("%d alloc %s job %s status %s", i, alloc.ID, alloc.Job.Name, alloc.ClientStatus)
}
}
if resp, err := rpc.EvalList(); err == nil {
for _, eval := range resp.Evaluations {
t.Logf("% #v\n", pretty.Formatter(eval))
}
}
t.Fatalf("failed waiting for all allocs to start: %v", err)
})

View File

@@ -57,18 +57,18 @@ func (n *nodeWatcher) run(ctx context.Context) {
for _, newNode := range newNodes {
if existingNode, ok := n.nodes[newNode.ID]; ok {
// Node was draining, see if it has changed
if !newNode.Drain {
if newNode.DrainStrategy == nil {
// Node stopped draining
delete(n.nodes, newNode.ID)
changed = true
} else if !newNode.DrainStrategy.DeadlineTime().Equal(existingNode.DrainStrategy.DeadlineTime()) {
} else if !newNode.DrainStrategy.Equal(existingNode.DrainStrategy) {
// Update deadline
n.nodes[newNode.ID] = newNode
changed = true
}
} else {
// Node was not draining
if newNode.Drain {
if newNode.DrainStrategy != nil {
// Node started draining
n.nodes[newNode.ID] = newNode
changed = true

View File

@@ -443,6 +443,11 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest,
}
}
// Mark the deadline time
if args.DrainStrategy != nil && args.DrainStrategy.Deadline.Nanoseconds() > 0 {
args.DrainStrategy.ForceDeadline = time.Now().Add(args.DrainStrategy.Deadline)
}
// Commit this update via Raft
_, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args)
if err != nil {
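A standalone sketch of the effect (values are illustrative; only fields shown elsewhere in this diff are used): the relative deadline from the request is pinned to an absolute time once, before the Raft apply, so every server and every later read of the node agrees on ForceDeadline.

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	args := &structs.NodeUpdateDrainRequest{
		DrainStrategy: &structs.DrainStrategy{
			DrainSpec: structs.DrainSpec{Deadline: 2 * time.Hour},
		},
	}

	// Mirrors the RPC above: pin the absolute deadline before committing.
	if args.DrainStrategy != nil && args.DrainStrategy.Deadline.Nanoseconds() > 0 {
		args.DrainStrategy.ForceDeadline = time.Now().Add(args.DrainStrategy.Deadline)
	}

	inf, deadline := args.DrainStrategy.DeadlineTime()
	fmt.Printf("infinite=%v deadline=%s\n", inf, deadline) // infinite=false, deadline roughly two hours out
}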

View File

@@ -1214,9 +1214,9 @@ type DrainStrategy struct {
// DrainSpec is the user declared drain specification
DrainSpec
// StartTime as nanoseconds since Unix epoch indicating when a drain
// began for deadline calculations.
StartTime int64
// ForceDeadline is the deadline time for the drain after which drains will
// be forced
ForceDeadline time.Time
}
func (d *DrainStrategy) Copy() *DrainStrategy {
@@ -1229,16 +1229,47 @@ func (d *DrainStrategy) Copy() *DrainStrategy {
return nd
}
// DeadlineTime returns the Time this drain's deadline will be reached or the
// zero value for Time if DrainStrategy is nil or Duration is <= 0.
func (d *DrainStrategy) DeadlineTime() time.Time {
// DeadlineTime returns whether the drain strategy allows an infinite duration
// and, if not, the absolute time at which the drain is forced. A force drain is
// captured by the deadline time being in the past.
func (d *DrainStrategy) DeadlineTime() (infinite bool, deadline time.Time) {
// Treat the nil case as a force drain so that, during an upgrade, a node that
// has Drain set to true but no drain strategy is treated as a force drain,
// mimicking the old behavior.
if d == nil {
return time.Time{}
return false, time.Time{}
}
if d.Deadline <= 0 {
return time.Time{}
ns := d.Deadline.Nanoseconds()
switch {
case ns < 0: // Force
return false, time.Time{}
case ns == 0: // Infinite
return true, time.Time{}
default:
return false, d.ForceDeadline
}
return time.Unix(0, d.StartTime).Add(d.Deadline)
}
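// Aside (illustrative sketch, not part of this change): the three cases as a
// caller would see them. A test like this could live in a _test.go file; it
// uses only fields introduced in this diff, and the test name is hypothetical.
func TestDrainStrategy_DeadlineTime_sketch(t *testing.T) {
	now := time.Now()

	// Nil strategies and negative deadlines read as forced drains: not
	// infinite, with a deadline already in the past.
	var forced *DrainStrategy
	if inf, d := forced.DeadlineTime(); inf || !d.Before(now) {
		t.Fatalf("nil strategy should read as a forced drain")
	}
	negative := &DrainStrategy{DrainSpec: DrainSpec{Deadline: -1 * time.Second}}
	if inf, d := negative.DeadlineTime(); inf || !d.Before(now) {
		t.Fatalf("negative deadline should read as a forced drain")
	}

	// A zero deadline means drain with no deadline at all.
	infinite := &DrainStrategy{DrainSpec: DrainSpec{Deadline: 0}}
	if inf, _ := infinite.DeadlineTime(); !inf {
		t.Fatalf("zero deadline should be infinite")
	}

	// A positive deadline reports the absolute ForceDeadline pinned by
	// Node.UpdateDrain.
	timed := &DrainStrategy{
		DrainSpec:     DrainSpec{Deadline: time.Hour},
		ForceDeadline: now.Add(time.Hour),
	}
	if inf, d := timed.DeadlineTime(); inf || !d.Equal(now.Add(time.Hour)) {
		t.Fatalf("expected the pinned deadline")
	}
}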
func (d *DrainStrategy) Equal(o *DrainStrategy) bool {
if d == nil && o == nil {
return true
} else if o != nil && d == nil {
return false
} else if d != nil && o == nil {
return false
}
// Compare values
if d.ForceDeadline != o.ForceDeadline {
return false
} else if d.Deadline != o.Deadline {
return false
} else if d.IgnoreSystemJobs != o.IgnoreSystemJobs {
return false
}
return true
}
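// Aside (illustrative sketch, not part of this change): the node watcher
// earlier in this diff relies on Equal to detect both deadline and spec
// changes. A test along these lines could sit in a _test.go file; the name
// and fixture values are hypothetical.
func TestDrainStrategy_Equal_sketch(t *testing.T) {
	base := time.Date(2018, 3, 1, 0, 0, 0, 0, time.UTC)
	a := &DrainStrategy{
		DrainSpec:     DrainSpec{Deadline: time.Hour},
		ForceDeadline: base.Add(time.Hour),
	}

	var none *DrainStrategy
	if !none.Equal(nil) || a.Equal(nil) || none.Equal(a) {
		t.Fatalf("nil handling mismatch")
	}

	if !a.Equal(a.Copy()) {
		t.Fatalf("a copy should compare equal")
	}

	// Moving the deadline is exactly the change that makes the watcher
	// re-register the node as changed.
	moved := a.Copy()
	moved.ForceDeadline = moved.ForceDeadline.Add(time.Minute)
	if a.Equal(moved) {
		t.Fatalf("a moved deadline should not compare equal")
	}
}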
// Node is a representation of a schedulable client node