Track task start/finish time & improve logs errors

This PR adds tracking to when a task starts and finishes and the logs
API takes advantage of this and returns better errors when asking for
logs that do not exist.
This commit is contained in:
Alex Dadgar 2017-03-31 15:57:10 -07:00
parent bb03a42ab6
commit 81b78f77e1
7 changed files with 79 additions and 14 deletions

View file

@ -424,9 +424,11 @@ func (t *Task) SetLogConfig(l *LogConfig) *Task {
// TaskState tracks the current state of a task and events that caused state
// transitions.
type TaskState struct {
State string
Failed bool
Events []*TaskEvent
State string
Failed bool
StartedAt time.Time
FinishedAt time.Time
Events []*TaskEvent
}
const (

View file

@ -381,8 +381,16 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
return
}
taskState.State = state
if state == structs.TaskStateDead {
switch state {
case structs.TaskStateRunning:
// Capture the start time if it is just starting
if taskState.State != structs.TaskStateRunning {
taskState.StartedAt = time.Now().UTC()
}
case structs.TaskStateDead:
// Capture the finished time
taskState.FinishedAt = time.Now().UTC()
// Find all tasks that are not the one that is dead and check if the one
// that is dead is a leader
var otherTaskRunners []*TaskRunner
@ -406,6 +414,7 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, otherTaskNames)
}
} else if leader {
// If the task was a leader task we should kill all the other tasks.
for _, tr := range otherTaskRunners {
tr.Destroy(structs.NewTaskEvent(structs.TaskLeaderDead))
}
@ -413,11 +422,11 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
r.logger.Printf("[DEBUG] client: leader task %q is dead, destroying other tasks in task group: %v", taskName, otherTaskNames)
}
}
// If the task was a leader task we should kill all the other
// tasks.
}
// Store the new state
taskState.State = state
select {
case r.dirtyCh <- struct{}{}:
default:

View file

@ -633,6 +633,9 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
if state1.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
}
if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}
if len(state1.Events) < 2 {
// At least have a received and destroyed
return false, fmt.Errorf("Unexpected number of events")
@ -700,6 +703,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
if state1.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
}
if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}
if len(state1.Events) < 2 {
// At least have a received and destroyed
return false, fmt.Errorf("Unexpected number of events")
@ -721,6 +727,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
if state2.State != structs.TaskStateDead {
return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
}
if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
return false, fmt.Errorf("expected to have a start and finish time")
}
return true, nil
}, func(err error) {

View file

@ -531,11 +531,21 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
ar, ok := c.allocs[allocID]
if !ok {
return nil, fmt.Errorf("alloc not found")
return nil, fmt.Errorf("unknown allocation ID %q", allocID)
}
return ar.GetAllocDir(), nil
}
// GetClientAlloc returns the allocation from the client
func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
all := c.allAllocs()
alloc, ok := all[allocID]
if !ok {
return nil, fmt.Errorf("unknown allocation ID %q", allocID)
}
return alloc, nil
}
// GetServers returns the list of nomad servers this client is aware of.
func (c *Client) GetServers() []string {
endpoints := c.servers.all()
@ -2349,20 +2359,21 @@ func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resource
}
// allAllocs returns all the allocations managed by the client
func (c *Client) allAllocs() []*structs.Allocation {
var allocs []*structs.Allocation
func (c *Client) allAllocs() map[string]*structs.Allocation {
allocs := make(map[string]*structs.Allocation, 16)
for _, ar := range c.getAllocRunners() {
allocs = append(allocs, ar.Alloc())
a := ar.Alloc()
allocs[a.ID] = a
}
c.blockedAllocsLock.Lock()
for _, alloc := range c.blockedAllocations {
allocs = append(allocs, alloc)
allocs[alloc.ID] = alloc
}
c.blockedAllocsLock.Unlock()
c.migratingAllocsLock.Lock()
for _, ctrl := range c.migratingAllocs {
allocs = append(allocs, ctrl.alloc)
allocs[ctrl.alloc.ID] = ctrl.alloc
}
c.migratingAllocsLock.Unlock()
return allocs

View file

@ -761,6 +761,24 @@ func (s *HTTPServer) Logs(resp http.ResponseWriter, req *http.Request) (interfac
return nil, err
}
alloc, err := s.agent.client.GetClientAlloc(allocID)
if err != nil {
return nil, err
}
// Check that the task is there
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
return nil, fmt.Errorf("Failed to lookup task group for allocation")
} else if taskStruct := tg.LookupTask(task); taskStruct == nil {
return nil, CodedError(404, fmt.Sprintf("task group %q does not have task with name %q", alloc.TaskGroup, task))
}
state, ok := alloc.TaskStates[task]
if !ok || state.StartedAt.IsZero() {
return nil, CodedError(404, fmt.Sprintf("task %q not started yet. No logs available", task))
}
// Create an output that gets flushed on every write
output := ioutils.NewWriteFlusher(resp)

View file

@ -2774,6 +2774,14 @@ type TaskState struct {
// Failed marks a task as having failed
Failed bool
// StartedAt is the time the task is started. It is updated each time the
// task starts
StartedAt time.Time
// FinishedAt is the time at which the task transistioned to dead and will
// not be started again.
FinishedAt time.Time
// Series of task events that transition the state of the task.
Events []*TaskEvent
}
@ -2785,6 +2793,8 @@ func (ts *TaskState) Copy() *TaskState {
copy := new(TaskState)
copy.State = ts.State
copy.Failed = ts.Failed
copy.StartedAt = ts.StartedAt
copy.FinishedAt = ts.FinishedAt
if ts.Events != nil {
copy.Events = make([]*TaskEvent, len(ts.Events))

View file

@ -225,6 +225,8 @@ be specified using the `?region=` query parameter.
}
],
"State": "running"
"FinishedAt": "0001-01-01T00:00:00Z",
"StartedAt": "2017-03-31T22:51:40.248633594Z",
"Failed": false,
}
},
@ -247,6 +249,10 @@ be specified using the `?region=` query parameter.
* `TaskStateRunning` - The task is currently running.
* `TaskStateDead` - The task is dead and will not run again.
Further the state contains the `StartedAt` and `FinishedAt` times of the
task. `StartedAt` can be updated multiple times if the task restarts but
`FinishedAt` is set only when the task transistions to `TaskStateDead`
<p>The latest 10 events are stored per task. Each event is timestamped (unix nano-seconds)
and has one of the following types:</p>