wip split event emitting and state transitions

Michael Schurter 2018-07-16 16:32:37 -07:00
parent 516d641db0
commit 6ebdf532ea


@@ -43,12 +43,15 @@ var (
 	// taskRunnerStateAllKey holds all the task runners state. At the moment
 	// there is no need to split it
 	//XXX refactor out of clientstate and new state
-	taskRunnerStateAllKey = []byte("simple-all")
+	//XXX Old key - going to need to migrate
+	//taskRunnerStateAllKey = []byte("simple-all")
+	taskLocalStateKey = []byte("local_state")
+	taskStateKey      = []byte("task_state")
 )

 type TaskRunner struct {
-	// allocID and taskName are immutable so store a copy to access without
-	// locks
+	// allocID and taskName are immutable so these fields may be accessed
+	// without locks
 	allocID  string
 	taskName string
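The hunk above replaces the single "simple-all" blob with two bolt keys: local_state for node-local runner state and task_state for the TaskState that is synced to servers. The //XXX note says data written under the old key will still need migrating. A minimal sketch of what that migration could look like, using only bolt bucket primitives; the helper name and the choice to carry the old blob over as local state are assumptions, not part of this commit (import path assumed to be the boltdb/bolt package Nomad used at the time):

import bolt "github.com/boltdb/bolt"

// migrateOldTaskKey is a hypothetical helper (not in this commit) sketching
// the migration hinted at by the //XXX note: move the blob stored under the
// old combined key and remove the old entry.
func migrateOldTaskKey(bkt *bolt.Bucket) error {
	old := bkt.Get([]byte("simple-all"))
	if old == nil {
		// Nothing persisted under the old key; nothing to migrate.
		return nil
	}
	// Assumption: the old blob maps onto the new local_state key; a real
	// migration would have to split it between local_state and task_state.
	if err := bkt.Put([]byte("local_state"), old); err != nil {
		return err
	}
	return bkt.Delete([]byte("simple-all"))
}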
@@ -58,7 +61,8 @@ type TaskRunner struct {
 	clientConfig *config.Config

 	// state captures the state of the task for updating the allocation
 	state     *structs.TaskState
+	stateLock sync.Mutex

 	// localState captures the node-local state of the task for when the
 	// Nomad agent restarts
@@ -368,10 +372,11 @@ func (tr *TaskRunner) runDriver() error {
 func (tr *TaskRunner) initDriver() error {
 	// Create a task-specific event emitter callback to expose minimal
 	// state to drivers
+	//XXX Replace with EmitEvent -- no need for a shim
 	eventEmitter := func(m string, args ...interface{}) {
 		msg := fmt.Sprintf(m, args...)
 		tr.logger.Debug("driver event", "event", msg)
-		tr.SetState("", structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
+		tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
 	}

 	alloc := tr.Alloc()
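The //XXX in this hunk suggests the fmt-style shim can eventually be dropped now that EmitEvent exists. A rough sketch of the narrower dependency driver plumbing could take instead; the interface name and wiring are illustrative assumptions, not part of this commit (assumes import "github.com/hashicorp/nomad/nomad/structs"):

// EventEmitter is a hypothetical interface (not in this commit) that driver
// plumbing could accept instead of a fmt-style callback; *TaskRunner already
// satisfies it after this change.
type EventEmitter interface {
	EmitEvent(event *structs.TaskEvent)
}

// driverEventExample shows a call site once the shim is gone: the caller
// builds the TaskEvent itself rather than passing a format string.
func driverEventExample(e EventEmitter, msg string) {
	e.EmitEvent(structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
}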
@@ -455,7 +460,7 @@ func (tr *TaskRunner) persistLocalState() error {
 		return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
 	}

-	if err := clientstate.PutData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
+	if err := clientstate.PutData(taskBkt, taskLocalStateKey, buf.Bytes()); err != nil {
 		return fmt.Errorf("failed to write task_runner state: %v", err)
 	}
@@ -469,60 +474,52 @@ func (tr *TaskRunner) persistLocalState() error {
 }

 // Restore task runner state. Called by AllocRunner.Restore after NewTaskRunner
-// but before Run.
+// but before Run so no locks need to be acquired.
 func (tr *TaskRunner) Restore(tx *bolt.Tx) error {
 	bkt, err := clientstate.GetTaskBucket(tx, tr.allocID, tr.taskName)
 	if err != nil {
 		return fmt.Errorf("failed to get task %q bucket: %v", tr.taskName, err)
 	}

+	// Restore Local State
 	//XXX set persisted hash to avoid immediate write on first use?
 	var ls state.LocalState
-	if err := clientstate.GetObject(bkt, taskRunnerStateAllKey, &ls); err != nil {
-		return fmt.Errorf("failed to read task runner state: %v", err)
+	if err := clientstate.GetObject(bkt, taskLocalStateKey, &ls); err != nil {
+		return fmt.Errorf("failed to read local task runner state: %v", err)
 	}
 	tr.localState = &ls

+	// Restore Task State
+	var ts structs.TaskState
+	if err := clientstate.GetObject(bkt, taskStateKey, &ts); err != nil {
+		return fmt.Errorf("failed to read task state: %v", err)
+	}
+	tr.state = &ts
+
 	return nil
 }

 // SetState sets the task runners allocation state.
 func (tr *TaskRunner) SetState(state string, event *structs.TaskEvent) {
-	// Ensure the event is populated with human readable strings
-	event.PopulateEventDisplayMessage()
+	tr.stateLock.Lock()
+	defer tr.stateLock.Unlock()

-	task := tr.state
+	taskState := tr.state

-	// Update the state of the task
-	if state != "" {
-		task.State = state
+	//XXX REMOVE ME AFTER TESTING
+	if state == "" {
+		panic("SetState must not be called with an empty state")
 	}

-	// Handle the event
-	if event == nil {
-		if event.FailsTask {
-			task.Failed = true
-		}
-
-		if event.Type == structs.TaskRestarting {
-			if !tr.clientConfig.DisableTaggedMetrics {
-				metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels)
-			}
-			//if r.config.BackwardsCompatibleMetrics {
-			//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
-			//}
-			task.Restarts++
-			task.LastRestart = time.Unix(0, event.Time)
-		}
-
-		appendTaskEvent(task, event)
-	}
-
-	// Handle the state transistion.
+	// Append the event
+	tr.emitEventImpl(event)
+
+	// Handle the state transition.
 	switch state {
 	case structs.TaskStateRunning:
 		// Capture the start time if it is just starting
-		if task.State != structs.TaskStateRunning {
-			task.StartedAt = time.Now().UTC()
+		if taskState.State != structs.TaskStateRunning {
+			taskState.StartedAt = time.Now().UTC()
 			if !tr.clientConfig.DisableTaggedMetrics {
 				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"}, 1, tr.baseLabels)
 			}
@@ -532,12 +529,12 @@ func (tr *TaskRunner) SetState(state string, event *structs.TaskEvent) {
 		}
 	case structs.TaskStateDead:
 		// Capture the finished time if not already set
-		if task.FinishedAt.IsZero() {
-			task.FinishedAt = time.Now().UTC()
+		if taskState.FinishedAt.IsZero() {
+			taskState.FinishedAt = time.Now().UTC()
 		}

 		// Emitting metrics to indicate task complete and failures
-		if task.Failed {
+		if taskState.Failed {
 			if !tr.clientConfig.DisableTaggedMetrics {
 				metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"}, 1, tr.baseLabels)
 			}
@@ -554,15 +551,80 @@ func (tr *TaskRunner) SetState(state string, event *structs.TaskEvent) {
 		}
 	}

+	// Persist the state and event
+	err := tr.stateDB.Update(func(tx *bolt.Tx) error {
+		bkt, err := clientstate.GetTaskBucket(tx, tr.allocID, tr.taskName)
+		if err != nil {
+			return err
+		}
+
+		return clientstate.PutObject(bkt, taskStateKey, tr.state)
+	})
+	if err != nil {
+		// Only a warning because the next event/state-transition will
+		// try to persist it again.
+		tr.logger.Error("error persisting task state", "error", err, "event", event, "state", state)
+	}
+
 	// Create a copy and notify the alloc runner of the transition
-	//FIXME
+	//FIXME <-------START HERE
 	//if err := tr.allocRunner.StateUpdated(tr.state.Copy()); err != nil {
 	//tr.logger.Error("failed to save state", "error", err)
 	//}
 }

+// EmitEvent appends a new TaskEvent to this task's TaskState. The actual
+// TaskState.State (pending, running, dead) is *not* updated. Use SetState to
+// transition states.
+// Events are persisted locally but errors are simply logged.
 func (tr *TaskRunner) EmitEvent(event *structs.TaskEvent) {
-	tr.SetState("", event)
+	tr.stateLock.Lock()
+	defer tr.stateLock.Unlock()
+
+	tr.emitEventImpl(event)
+
+	// Events that do *not* change task state can be batched.
+	//XXX Seems like this clamps the maximum transaction latency to 10ms.
+	err := tr.stateDB.Batch(func(tx *bolt.Tx) error {
+		bkt, err := clientstate.GetTaskBucket(tx, tr.allocID, tr.taskName)
+		if err != nil {
+			return err
+		}
+
+		return clientstate.PutObject(bkt, taskStateKey, tr.state)
+	})
+	if err != nil {
+		// Only a warning because the next event/state-transition will
+		// try to persist it again.
+		tr.logger.Warn("error persisting event", "error", err, "event", event)
+	}
+}
+
+// emitEventImpl is the implementation of EmitEvent without the locking so it
+// can be used from SetState.
+func (tr *TaskRunner) emitEventImpl(event *structs.TaskEvent) error {
+	// Ensure the event is populated with human readable strings
+	event.PopulateEventDisplayMessage()
+
+	// Propagate failure from event to task state
+	if event.FailsTask {
+		tr.state.Failed = true
+	}
+
+	// Update restart metrics
+	if event.Type == structs.TaskRestarting {
+		if !tr.clientConfig.DisableTaggedMetrics {
+			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, tr.baseLabels)
+		}
+		//if r.config.BackwardsCompatibleMetrics {
+		//metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
+		//}
+		tr.state.Restarts++
+		tr.state.LastRestart = time.Unix(0, event.Time)
+	}
+
+	// Append event to slice
+	appendTaskEvent(tr.state, event)
+
+	return nil
+}

 // WaitCh is closed when TaskRunner.Run exits.
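Taken together, the split gives callers two entry points with different persistence behavior: SetState appends the event, performs the lifecycle transition, and writes synchronously via stateDB.Update, while EmitEvent only appends the event and writes via stateDB.Batch, which lets bolt coalesce concurrent writers into one transaction (bolt's default MaxBatchDelay is 10ms, which is what the //XXX latency note refers to). A hedged sketch of the calling convention; the event constants exist in Nomad's structs package, but the surrounding function is invented for illustration:

// exampleTransitions is illustrative only; it shows how call sites would use
// the split API after this commit, not code from the change itself.
func exampleTransitions(tr *TaskRunner) {
	// Informational event: no lifecycle change, persisted in a batched
	// bolt transaction via EmitEvent.
	tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverMessage).
		SetDriverMessage("driver output received"))

	// Lifecycle transition: records the event and moves the task to
	// running, persisted in its own bolt transaction via SetState.
	tr.SetState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
}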
@@ -586,9 +648,11 @@ func (tr *TaskRunner) Update(update *structs.Allocation) {
 // appendTaskEvent updates the task status by appending the new event.
 func appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent) {
-	capacity := 10
+	const capacity = 10
 	if state.Events == nil {
-		state.Events = make([]*structs.TaskEvent, 0, capacity)
+		state.Events = make([]*structs.TaskEvent, 1, capacity)
+		state.Events[0] = event
+		return
 	}

 	// If we hit capacity, then shift it.
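The hunk is cut off at the shift comment. For context, a reconstruction of the remainder of appendTaskEvent as that comment describes it (keep at most capacity events, dropping the oldest before appending); this is a sketch of the behavior, not a verbatim copy of the file:

// appendTaskEventSketch shows the bounded-append behavior referenced above:
// once capacity events are stored, drop the oldest and append the new one.
func appendTaskEventSketch(state *structs.TaskState, event *structs.TaskEvent) {
	const capacity = 10
	if state.Events == nil {
		state.Events = make([]*structs.TaskEvent, 1, capacity)
		state.Events[0] = event
		return
	}

	// If we hit capacity, then shift it.
	if len(state.Events) == capacity {
		old := state.Events
		state.Events = make([]*structs.TaskEvent, 0, capacity)
		state.Events = append(state.Events, old[1:]...)
	}

	state.Events = append(state.Events, event)
}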