From 81b78f77e1931d14018be2c837d3f291ec0d0c11 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Fri, 31 Mar 2017 15:57:10 -0700 Subject: [PATCH] Track task start/finish time & improve logs errors This PR adds tracking to when a task starts and finishes and the logs API takes advantage of this and returns better errors when asking for logs that do not exist. --- api/tasks.go | 8 +++++--- client/alloc_runner.go | 19 ++++++++++++++----- client/alloc_runner_test.go | 9 +++++++++ client/client.go | 23 +++++++++++++++++------ command/agent/fs_endpoint.go | 18 ++++++++++++++++++ nomad/structs/structs.go | 10 ++++++++++ website/source/docs/http/alloc.html.md | 6 ++++++ 7 files changed, 79 insertions(+), 14 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index 0e8269896..fc490b37f 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -424,9 +424,11 @@ func (t *Task) SetLogConfig(l *LogConfig) *Task { // TaskState tracks the current state of a task and events that caused state // transitions. type TaskState struct { - State string - Failed bool - Events []*TaskEvent + State string + Failed bool + StartedAt time.Time + FinishedAt time.Time + Events []*TaskEvent } const ( diff --git a/client/alloc_runner.go b/client/alloc_runner.go index 43071f47b..bacebee20 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -381,8 +381,16 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv return } - taskState.State = state - if state == structs.TaskStateDead { + switch state { + case structs.TaskStateRunning: + // Capture the start time if it is just starting + if taskState.State != structs.TaskStateRunning { + taskState.StartedAt = time.Now().UTC() + } + case structs.TaskStateDead: + // Capture the finished time + taskState.FinishedAt = time.Now().UTC() + // Find all tasks that are not the one that is dead and check if the one // that is dead is a leader var otherTaskRunners []*TaskRunner @@ -406,6 +414,7 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, otherTaskNames) } } else if leader { + // If the task was a leader task we should kill all the other tasks. for _, tr := range otherTaskRunners { tr.Destroy(structs.NewTaskEvent(structs.TaskLeaderDead)) } @@ -413,11 +422,11 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv r.logger.Printf("[DEBUG] client: leader task %q is dead, destroying other tasks in task group: %v", taskName, otherTaskNames) } } - - // If the task was a leader task we should kill all the other - // tasks. } + // Store the new state + taskState.State = state + select { case r.dirtyCh <- struct{}{}: default: diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go index 721a6e016..bd52786fb 100644 --- a/client/alloc_runner_test.go +++ b/client/alloc_runner_test.go @@ -633,6 +633,9 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) { if state1.State != structs.TaskStateDead { return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead) } + if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() { + return false, fmt.Errorf("expected to have a start and finish time") + } if len(state1.Events) < 2 { // At least have a received and destroyed return false, fmt.Errorf("Unexpected number of events") @@ -700,6 +703,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { if state1.State != structs.TaskStateDead { return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead) } + if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() { + return false, fmt.Errorf("expected to have a start and finish time") + } if len(state1.Events) < 2 { // At least have a received and destroyed return false, fmt.Errorf("Unexpected number of events") @@ -721,6 +727,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { if state2.State != structs.TaskStateDead { return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead) } + if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() { + return false, fmt.Errorf("expected to have a start and finish time") + } return true, nil }, func(err error) { diff --git a/client/client.go b/client/client.go index 1b3401f69..9894dd482 100644 --- a/client/client.go +++ b/client/client.go @@ -531,11 +531,21 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { ar, ok := c.allocs[allocID] if !ok { - return nil, fmt.Errorf("alloc not found") + return nil, fmt.Errorf("unknown allocation ID %q", allocID) } return ar.GetAllocDir(), nil } +// GetClientAlloc returns the allocation from the client +func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) { + all := c.allAllocs() + alloc, ok := all[allocID] + if !ok { + return nil, fmt.Errorf("unknown allocation ID %q", allocID) + } + return alloc, nil +} + // GetServers returns the list of nomad servers this client is aware of. func (c *Client) GetServers() []string { endpoints := c.servers.all() @@ -2349,20 +2359,21 @@ func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resource } // allAllocs returns all the allocations managed by the client -func (c *Client) allAllocs() []*structs.Allocation { - var allocs []*structs.Allocation +func (c *Client) allAllocs() map[string]*structs.Allocation { + allocs := make(map[string]*structs.Allocation, 16) for _, ar := range c.getAllocRunners() { - allocs = append(allocs, ar.Alloc()) + a := ar.Alloc() + allocs[a.ID] = a } c.blockedAllocsLock.Lock() for _, alloc := range c.blockedAllocations { - allocs = append(allocs, alloc) + allocs[alloc.ID] = alloc } c.blockedAllocsLock.Unlock() c.migratingAllocsLock.Lock() for _, ctrl := range c.migratingAllocs { - allocs = append(allocs, ctrl.alloc) + allocs[ctrl.alloc.ID] = ctrl.alloc } c.migratingAllocsLock.Unlock() return allocs diff --git a/command/agent/fs_endpoint.go b/command/agent/fs_endpoint.go index 5153da166..7658e6628 100644 --- a/command/agent/fs_endpoint.go +++ b/command/agent/fs_endpoint.go @@ -761,6 +761,24 @@ func (s *HTTPServer) Logs(resp http.ResponseWriter, req *http.Request) (interfac return nil, err } + alloc, err := s.agent.client.GetClientAlloc(allocID) + if err != nil { + return nil, err + } + + // Check that the task is there + tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) + if tg == nil { + return nil, fmt.Errorf("Failed to lookup task group for allocation") + } else if taskStruct := tg.LookupTask(task); taskStruct == nil { + return nil, CodedError(404, fmt.Sprintf("task group %q does not have task with name %q", alloc.TaskGroup, task)) + } + + state, ok := alloc.TaskStates[task] + if !ok || state.StartedAt.IsZero() { + return nil, CodedError(404, fmt.Sprintf("task %q not started yet. No logs available", task)) + } + // Create an output that gets flushed on every write output := ioutils.NewWriteFlusher(resp) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index f85ae1d52..868767a1a 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2774,6 +2774,14 @@ type TaskState struct { // Failed marks a task as having failed Failed bool + // StartedAt is the time the task is started. It is updated each time the + // task starts + StartedAt time.Time + + // FinishedAt is the time at which the task transistioned to dead and will + // not be started again. + FinishedAt time.Time + // Series of task events that transition the state of the task. Events []*TaskEvent } @@ -2785,6 +2793,8 @@ func (ts *TaskState) Copy() *TaskState { copy := new(TaskState) copy.State = ts.State copy.Failed = ts.Failed + copy.StartedAt = ts.StartedAt + copy.FinishedAt = ts.FinishedAt if ts.Events != nil { copy.Events = make([]*TaskEvent, len(ts.Events)) diff --git a/website/source/docs/http/alloc.html.md b/website/source/docs/http/alloc.html.md index 294fdbf19..0ad22b79c 100644 --- a/website/source/docs/http/alloc.html.md +++ b/website/source/docs/http/alloc.html.md @@ -225,6 +225,8 @@ be specified using the `?region=` query parameter. } ], "State": "running" + "FinishedAt": "0001-01-01T00:00:00Z", + "StartedAt": "2017-03-31T22:51:40.248633594Z", "Failed": false, } }, @@ -247,6 +249,10 @@ be specified using the `?region=` query parameter. * `TaskStateRunning` - The task is currently running. * `TaskStateDead` - The task is dead and will not run again. + Further the state contains the `StartedAt` and `FinishedAt` times of the + task. `StartedAt` can be updated multiple times if the task restarts but + `FinishedAt` is set only when the task transistions to `TaskStateDead` +

The latest 10 events are stored per task. Each event is timestamped (unix nano-seconds) and has one of the following types: