Track task start/finish time & improve logs errors
This PR adds tracking to when a task starts and finishes and the logs API takes advantage of this and returns better errors when asking for logs that do not exist.
This commit is contained in:
parent
bb03a42ab6
commit
81b78f77e1
|
@ -424,9 +424,11 @@ func (t *Task) SetLogConfig(l *LogConfig) *Task {
|
|||
// TaskState tracks the current state of a task and events that caused state
|
||||
// transitions.
|
||||
type TaskState struct {
|
||||
State string
|
||||
Failed bool
|
||||
Events []*TaskEvent
|
||||
State string
|
||||
Failed bool
|
||||
StartedAt time.Time
|
||||
FinishedAt time.Time
|
||||
Events []*TaskEvent
|
||||
}
|
||||
|
||||
const (
|
||||
|
|
|
@ -381,8 +381,16 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
|
|||
return
|
||||
}
|
||||
|
||||
taskState.State = state
|
||||
if state == structs.TaskStateDead {
|
||||
switch state {
|
||||
case structs.TaskStateRunning:
|
||||
// Capture the start time if it is just starting
|
||||
if taskState.State != structs.TaskStateRunning {
|
||||
taskState.StartedAt = time.Now().UTC()
|
||||
}
|
||||
case structs.TaskStateDead:
|
||||
// Capture the finished time
|
||||
taskState.FinishedAt = time.Now().UTC()
|
||||
|
||||
// Find all tasks that are not the one that is dead and check if the one
|
||||
// that is dead is a leader
|
||||
var otherTaskRunners []*TaskRunner
|
||||
|
@ -406,6 +414,7 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
|
|||
r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, otherTaskNames)
|
||||
}
|
||||
} else if leader {
|
||||
// If the task was a leader task we should kill all the other tasks.
|
||||
for _, tr := range otherTaskRunners {
|
||||
tr.Destroy(structs.NewTaskEvent(structs.TaskLeaderDead))
|
||||
}
|
||||
|
@ -413,11 +422,11 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
|
|||
r.logger.Printf("[DEBUG] client: leader task %q is dead, destroying other tasks in task group: %v", taskName, otherTaskNames)
|
||||
}
|
||||
}
|
||||
|
||||
// If the task was a leader task we should kill all the other
|
||||
// tasks.
|
||||
}
|
||||
|
||||
// Store the new state
|
||||
taskState.State = state
|
||||
|
||||
select {
|
||||
case r.dirtyCh <- struct{}{}:
|
||||
default:
|
||||
|
|
|
@ -633,6 +633,9 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
|
|||
if state1.State != structs.TaskStateDead {
|
||||
return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
|
||||
}
|
||||
if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
|
||||
return false, fmt.Errorf("expected to have a start and finish time")
|
||||
}
|
||||
if len(state1.Events) < 2 {
|
||||
// At least have a received and destroyed
|
||||
return false, fmt.Errorf("Unexpected number of events")
|
||||
|
@ -700,6 +703,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
|
|||
if state1.State != structs.TaskStateDead {
|
||||
return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
|
||||
}
|
||||
if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
|
||||
return false, fmt.Errorf("expected to have a start and finish time")
|
||||
}
|
||||
if len(state1.Events) < 2 {
|
||||
// At least have a received and destroyed
|
||||
return false, fmt.Errorf("Unexpected number of events")
|
||||
|
@ -721,6 +727,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
|
|||
if state2.State != structs.TaskStateDead {
|
||||
return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
|
||||
}
|
||||
if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
|
||||
return false, fmt.Errorf("expected to have a start and finish time")
|
||||
}
|
||||
|
||||
return true, nil
|
||||
}, func(err error) {
|
||||
|
|
|
@ -531,11 +531,21 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
|
|||
|
||||
ar, ok := c.allocs[allocID]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("alloc not found")
|
||||
return nil, fmt.Errorf("unknown allocation ID %q", allocID)
|
||||
}
|
||||
return ar.GetAllocDir(), nil
|
||||
}
|
||||
|
||||
// GetClientAlloc returns the allocation from the client
|
||||
func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
|
||||
all := c.allAllocs()
|
||||
alloc, ok := all[allocID]
|
||||
if !ok {
|
||||
return nil, fmt.Errorf("unknown allocation ID %q", allocID)
|
||||
}
|
||||
return alloc, nil
|
||||
}
|
||||
|
||||
// GetServers returns the list of nomad servers this client is aware of.
|
||||
func (c *Client) GetServers() []string {
|
||||
endpoints := c.servers.all()
|
||||
|
@ -2349,20 +2359,21 @@ func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resource
|
|||
}
|
||||
|
||||
// allAllocs returns all the allocations managed by the client
|
||||
func (c *Client) allAllocs() []*structs.Allocation {
|
||||
var allocs []*structs.Allocation
|
||||
func (c *Client) allAllocs() map[string]*structs.Allocation {
|
||||
allocs := make(map[string]*structs.Allocation, 16)
|
||||
for _, ar := range c.getAllocRunners() {
|
||||
allocs = append(allocs, ar.Alloc())
|
||||
a := ar.Alloc()
|
||||
allocs[a.ID] = a
|
||||
}
|
||||
c.blockedAllocsLock.Lock()
|
||||
for _, alloc := range c.blockedAllocations {
|
||||
allocs = append(allocs, alloc)
|
||||
allocs[alloc.ID] = alloc
|
||||
}
|
||||
c.blockedAllocsLock.Unlock()
|
||||
|
||||
c.migratingAllocsLock.Lock()
|
||||
for _, ctrl := range c.migratingAllocs {
|
||||
allocs = append(allocs, ctrl.alloc)
|
||||
allocs[ctrl.alloc.ID] = ctrl.alloc
|
||||
}
|
||||
c.migratingAllocsLock.Unlock()
|
||||
return allocs
|
||||
|
|
|
@ -761,6 +761,24 @@ func (s *HTTPServer) Logs(resp http.ResponseWriter, req *http.Request) (interfac
|
|||
return nil, err
|
||||
}
|
||||
|
||||
alloc, err := s.agent.client.GetClientAlloc(allocID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Check that the task is there
|
||||
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
|
||||
if tg == nil {
|
||||
return nil, fmt.Errorf("Failed to lookup task group for allocation")
|
||||
} else if taskStruct := tg.LookupTask(task); taskStruct == nil {
|
||||
return nil, CodedError(404, fmt.Sprintf("task group %q does not have task with name %q", alloc.TaskGroup, task))
|
||||
}
|
||||
|
||||
state, ok := alloc.TaskStates[task]
|
||||
if !ok || state.StartedAt.IsZero() {
|
||||
return nil, CodedError(404, fmt.Sprintf("task %q not started yet. No logs available", task))
|
||||
}
|
||||
|
||||
// Create an output that gets flushed on every write
|
||||
output := ioutils.NewWriteFlusher(resp)
|
||||
|
||||
|
|
|
@ -2774,6 +2774,14 @@ type TaskState struct {
|
|||
// Failed marks a task as having failed
|
||||
Failed bool
|
||||
|
||||
// StartedAt is the time the task is started. It is updated each time the
|
||||
// task starts
|
||||
StartedAt time.Time
|
||||
|
||||
// FinishedAt is the time at which the task transistioned to dead and will
|
||||
// not be started again.
|
||||
FinishedAt time.Time
|
||||
|
||||
// Series of task events that transition the state of the task.
|
||||
Events []*TaskEvent
|
||||
}
|
||||
|
@ -2785,6 +2793,8 @@ func (ts *TaskState) Copy() *TaskState {
|
|||
copy := new(TaskState)
|
||||
copy.State = ts.State
|
||||
copy.Failed = ts.Failed
|
||||
copy.StartedAt = ts.StartedAt
|
||||
copy.FinishedAt = ts.FinishedAt
|
||||
|
||||
if ts.Events != nil {
|
||||
copy.Events = make([]*TaskEvent, len(ts.Events))
|
||||
|
|
|
@ -225,6 +225,8 @@ be specified using the `?region=` query parameter.
|
|||
}
|
||||
],
|
||||
"State": "running"
|
||||
"FinishedAt": "0001-01-01T00:00:00Z",
|
||||
"StartedAt": "2017-03-31T22:51:40.248633594Z",
|
||||
"Failed": false,
|
||||
}
|
||||
},
|
||||
|
@ -247,6 +249,10 @@ be specified using the `?region=` query parameter.
|
|||
* `TaskStateRunning` - The task is currently running.
|
||||
* `TaskStateDead` - The task is dead and will not run again.
|
||||
|
||||
Further the state contains the `StartedAt` and `FinishedAt` times of the
|
||||
task. `StartedAt` can be updated multiple times if the task restarts but
|
||||
`FinishedAt` is set only when the task transistions to `TaskStateDead`
|
||||
|
||||
<p>The latest 10 events are stored per task. Each event is timestamped (unix nano-seconds)
|
||||
and has one of the following types:</p>
|
||||
|
||||
|
|
Loading…
Reference in a new issue