Track task start/finish time & improve logs errors

This PR adds tracking to when a task starts and finishes and the logs API takes advantage of this and returns better errors when asking for logs that do not exist.
2017-03-31 15:57:10 -07:00 · 2017-03-31 15:57:10 -07:00 · 81b78f77e1
parent bb03a42ab6
commit 81b78f77e1
7 changed files with 79 additions and 14 deletions
--- a/api/tasks.go
+++ b/api/tasks.go
@ -424,9 +424,11 @@ func (t *Task) SetLogConfig(l *LogConfig) *Task {
 // TaskState tracks the current state of a task and events that caused state
 // transitions.
 type TaskState struct {
-	State  string
-	Failed bool
-	Events []*TaskEvent
+	State      string
+	Failed     bool
+	StartedAt  time.Time
+	FinishedAt time.Time
+	Events     []*TaskEvent
 }

 const (
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@ -381,8 +381,16 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		return
 	}

-	taskState.State = state
-	if state == structs.TaskStateDead {
+	switch state {
+	case structs.TaskStateRunning:
+		// Capture the start time if it is just starting
+		if taskState.State != structs.TaskStateRunning {
+			taskState.StartedAt = time.Now().UTC()
+		}
+	case structs.TaskStateDead:
+		// Capture the finished time
+		taskState.FinishedAt = time.Now().UTC()
+
 		// Find all tasks that are not the one that is dead and check if the one
 		// that is dead is a leader
 		var otherTaskRunners []*TaskRunner
@ -406,6 +414,7 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 				r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, otherTaskNames)
 			}
 		} else if leader {
+			// If the task was a leader task we should kill all the other tasks.
 			for _, tr := range otherTaskRunners {
 				tr.Destroy(structs.NewTaskEvent(structs.TaskLeaderDead))
 			}
@ -413,11 +422,11 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 				r.logger.Printf("[DEBUG] client: leader task %q is dead, destroying other tasks in task group: %v", taskName, otherTaskNames)
 			}
 		}
-
-		// If the task was a leader task we should kill all the other
-		// tasks.
 	}

+	// Store the new state
+	taskState.State = state
+
 	select {
 	case r.dirtyCh <- struct{}{}:
 	default:
--- a/client/alloc_runner_test.go
+++ b/client/alloc_runner_test.go
@ -633,6 +633,9 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
 		if state1.State != structs.TaskStateDead {
 			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
 		}
+		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
+			return false, fmt.Errorf("expected to have a start and finish time")
+		}
 		if len(state1.Events) < 2 {
 			// At least have a received and destroyed
 			return false, fmt.Errorf("Unexpected number of events")
@ -700,6 +703,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
 		if state1.State != structs.TaskStateDead {
 			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
 		}
+		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
+			return false, fmt.Errorf("expected to have a start and finish time")
+		}
 		if len(state1.Events) < 2 {
 			// At least have a received and destroyed
 			return false, fmt.Errorf("Unexpected number of events")
@ -721,6 +727,9 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
 		if state2.State != structs.TaskStateDead {
 			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
 		}
+		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
+			return false, fmt.Errorf("expected to have a start and finish time")
+		}

 		return true, nil
 	}, func(err error) {
--- a/client/client.go
+++ b/client/client.go
@ -531,11 +531,21 @@ func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {

 	ar, ok := c.allocs[allocID]
 	if !ok {
-		return nil, fmt.Errorf("alloc not found")
+		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
 	}
 	return ar.GetAllocDir(), nil
 }

+// GetClientAlloc returns the allocation from the client
+func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) {
+	all := c.allAllocs()
+	alloc, ok := all[allocID]
+	if !ok {
+		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
+	}
+	return alloc, nil
+}
+
 // GetServers returns the list of nomad servers this client is aware of.
 func (c *Client) GetServers() []string {
 	endpoints := c.servers.all()
@ -2349,20 +2359,21 @@ func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resource
 }

 // allAllocs returns all the allocations managed by the client
-func (c *Client) allAllocs() []*structs.Allocation {
-	var allocs []*structs.Allocation
+func (c *Client) allAllocs() map[string]*structs.Allocation {
+	allocs := make(map[string]*structs.Allocation, 16)
 	for _, ar := range c.getAllocRunners() {
-		allocs = append(allocs, ar.Alloc())
+		a := ar.Alloc()
+		allocs[a.ID] = a
 	}
 	c.blockedAllocsLock.Lock()
 	for _, alloc := range c.blockedAllocations {
-		allocs = append(allocs, alloc)
+		allocs[alloc.ID] = alloc
 	}
 	c.blockedAllocsLock.Unlock()

 	c.migratingAllocsLock.Lock()
 	for _, ctrl := range c.migratingAllocs {
-		allocs = append(allocs, ctrl.alloc)
+		allocs[ctrl.alloc.ID] = ctrl.alloc
 	}
 	c.migratingAllocsLock.Unlock()
 	return allocs
--- a/command/agent/fs_endpoint.go
+++ b/command/agent/fs_endpoint.go
@ -761,6 +761,24 @@ func (s *HTTPServer) Logs(resp http.ResponseWriter, req *http.Request) (interfac
 		return nil, err
 	}

+	alloc, err := s.agent.client.GetClientAlloc(allocID)
+	if err != nil {
+		return nil, err
+	}
+
+	// Check that the task is there
+	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
+	if tg == nil {
+		return nil, fmt.Errorf("Failed to lookup task group for allocation")
+	} else if taskStruct := tg.LookupTask(task); taskStruct == nil {
+		return nil, CodedError(404, fmt.Sprintf("task group %q does not have task with name %q", alloc.TaskGroup, task))
+	}
+
+	state, ok := alloc.TaskStates[task]
+	if !ok || state.StartedAt.IsZero() {
+		return nil, CodedError(404, fmt.Sprintf("task %q not started yet. No logs available", task))
+	}
+
 	// Create an output that gets flushed on every write
 	output := ioutils.NewWriteFlusher(resp)

--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@ -2774,6 +2774,14 @@ type TaskState struct {
 	// Failed marks a task as having failed
 	Failed bool

+	// StartedAt is the time the task is started. It is updated each time the
+	// task starts
+	StartedAt time.Time
+
+	// FinishedAt is the time at which the task transistioned to dead and will
+	// not be started again.
+	FinishedAt time.Time
+
 	// Series of task events that transition the state of the task.
 	Events []*TaskEvent
 }
@ -2785,6 +2793,8 @@ func (ts *TaskState) Copy() *TaskState {
 	copy := new(TaskState)
 	copy.State = ts.State
 	copy.Failed = ts.Failed
+	copy.StartedAt = ts.StartedAt
+	copy.FinishedAt = ts.FinishedAt

 	if ts.Events != nil {
 		copy.Events = make([]*TaskEvent, len(ts.Events))
--- a/website/source/docs/http/alloc.html.md
+++ b/website/source/docs/http/alloc.html.md
@ -225,6 +225,8 @@ be specified using the `?region=` query parameter.
            }
          ],
          "State": "running"
+          "FinishedAt": "0001-01-01T00:00:00Z",
+          "StartedAt": "2017-03-31T22:51:40.248633594Z",
          "Failed": false,
        }
      },
@ -247,6 +249,10 @@ be specified using the `?region=` query parameter.
    * `TaskStateRunning` - The task is currently running.
    * `TaskStateDead` - The task is dead and will not run again.

+    Further the state contains the `StartedAt` and `FinishedAt` times of the
+    task. `StartedAt` can be updated multiple times if the task restarts but
+    `FinishedAt` is set only when the task transistions to `TaskStateDead`
+
    <p>The latest 10 events are stored per task. Each event is timestamped (unix nano-seconds)
    and has one of the following types:</p>