safe but slow

This commit is contained in:
Alex Dadgar 2016-02-10 13:44:53 -08:00
parent 071216a730
commit 0c4c3fc4ee
8 changed files with 129 additions and 55 deletions

View file

@ -31,16 +31,15 @@ type AllocRunner struct {
logger *log.Logger logger *log.Logger
consulService *ConsulService consulService *ConsulService
alloc *structs.Allocation alloc *structs.Allocation
allocLock sync.Mutex allocClientStatus string // Explicit status of allocation. Set when there are failures
// Explicit status of allocation. Set when there are failures
allocClientStatus string
allocClientDescription string allocClientDescription string
allocLock sync.Mutex
dirtyCh chan struct{} dirtyCh chan struct{}
ctx *driver.ExecContext ctx *driver.ExecContext
ctxLock sync.Mutex
tasks map[string]*TaskRunner tasks map[string]*TaskRunner
taskStates map[string]*structs.TaskState taskStates map[string]*structs.TaskState
restored map[string]struct{} restored map[string]struct{}
@ -76,7 +75,7 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, updater AllocStat
consulService: consulService, consulService: consulService,
dirtyCh: make(chan struct{}, 1), dirtyCh: make(chan struct{}, 1),
tasks: make(map[string]*TaskRunner), tasks: make(map[string]*TaskRunner),
taskStates: alloc.TaskStates, taskStates: copyTaskStates(alloc.TaskStates),
restored: make(map[string]struct{}), restored: make(map[string]struct{}),
updateCh: make(chan *structs.Allocation, 8), updateCh: make(chan *structs.Allocation, 8),
destroyCh: make(chan struct{}), destroyCh: make(chan struct{}),
@ -112,7 +111,7 @@ func (r *AllocRunner) RestoreState() error {
r.restored[name] = struct{}{} r.restored[name] = struct{}{}
task := &structs.Task{Name: name} task := &structs.Task{Name: name}
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.alloc, tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.Alloc(),
task, r.consulService) task, r.consulService)
r.tasks[name] = tr r.tasks[name] = tr
@ -153,16 +152,27 @@ func (r *AllocRunner) SaveState() error {
} }
func (r *AllocRunner) saveAllocRunnerState() error { func (r *AllocRunner) saveAllocRunnerState() error {
// Create the snapshot.
r.taskStatusLock.RLock() r.taskStatusLock.RLock()
defer r.taskStatusLock.RUnlock() states := copyTaskStates(r.taskStates)
r.taskStatusLock.RUnlock()
alloc := r.Alloc()
r.allocLock.Lock() r.allocLock.Lock()
defer r.allocLock.Unlock() allocClientStatus := r.allocClientStatus
allocClientDescription := r.allocClientDescription
r.allocLock.Unlock()
r.ctxLock.Lock()
ctx := r.ctx
r.ctxLock.Unlock()
snap := allocRunnerState{ snap := allocRunnerState{
Alloc: r.alloc, Alloc: alloc,
Context: r.ctx, Context: ctx,
AllocClientStatus: r.allocClientStatus, AllocClientStatus: allocClientStatus,
AllocClientDescription: r.allocClientDescription, AllocClientDescription: allocClientDescription,
TaskStates: r.taskStates, TaskStates: states,
} }
return persistState(r.stateFilePath(), &snap) return persistState(r.stateFilePath(), &snap)
} }
@ -186,16 +196,33 @@ func (r *AllocRunner) DestroyContext() error {
return r.ctx.AllocDir.Destroy() return r.ctx.AllocDir.Destroy()
} }
// copyTaskStates returns a new map that holds, for every task name in states,
// the result of calling Copy on that task's state. The returned map is never
// nil, even when states is nil or empty.
func copyTaskStates(states map[string]*structs.TaskState) map[string]*structs.TaskState {
	// Named "copied" rather than "copy" so the builtin copy function is not
	// shadowed inside this function.
	copied := make(map[string]*structs.TaskState, len(states))
	for name, state := range states {
		copied[name] = state.Copy()
	}
	return copied
}
// Alloc returns the associated allocation // Alloc returns the associated allocation
func (r *AllocRunner) Alloc() *structs.Allocation { func (r *AllocRunner) Alloc() *structs.Allocation {
r.allocLock.Lock() r.allocLock.Lock()
alloc := r.alloc.Copy() alloc := r.alloc.Copy()
// The status has explicitly been set.
if r.allocClientStatus != "" || r.allocClientDescription != "" {
alloc.ClientStatus = r.allocClientStatus
alloc.ClientDescription = r.allocClientDescription
r.allocLock.Unlock()
return alloc
}
r.allocLock.Unlock() r.allocLock.Unlock()
// Scan the task states to determine the status of the alloc // Scan the task states to determine the status of the alloc
var pending, running, dead, failed bool var pending, running, dead, failed bool
r.taskStatusLock.RLock() r.taskStatusLock.RLock()
alloc.TaskStates = r.taskStates alloc.TaskStates = copyTaskStates(r.taskStates)
for _, state := range r.taskStates { for _, state := range r.taskStates {
switch state.State { switch state.State {
case structs.TaskStateRunning: case structs.TaskStateRunning:
@ -213,13 +240,6 @@ func (r *AllocRunner) Alloc() *structs.Allocation {
} }
r.taskStatusLock.RUnlock() r.taskStatusLock.RUnlock()
// The status has explicitly been set.
if r.allocClientStatus != "" || r.allocClientDescription != "" {
alloc.ClientStatus = r.allocClientStatus
alloc.ClientDescription = r.allocClientDescription
return alloc
}
// Determine the alloc status // Determine the alloc status
if failed { if failed {
alloc.ClientStatus = structs.AllocClientStatusFailed alloc.ClientStatus = structs.AllocClientStatusFailed
@ -276,8 +296,10 @@ func (r *AllocRunner) syncStatus() error {
// setStatus is used to update the allocation status // setStatus is used to update the allocation status
func (r *AllocRunner) setStatus(status, desc string) { func (r *AllocRunner) setStatus(status, desc string) {
r.alloc.ClientStatus = status r.allocLock.Lock()
r.alloc.ClientDescription = desc r.allocClientStatus = status
r.allocClientDescription = desc
r.allocLock.Unlock()
select { select {
case r.dirtyCh <- struct{}{}: case r.dirtyCh <- struct{}{}:
default: default:
@ -336,6 +358,7 @@ func (r *AllocRunner) Run() {
} }
// Create the execution context // Create the execution context
r.ctxLock.Lock()
if r.ctx == nil { if r.ctx == nil {
allocDir := allocdir.NewAllocDir(filepath.Join(r.config.AllocDir, r.alloc.ID)) allocDir := allocdir.NewAllocDir(filepath.Join(r.config.AllocDir, r.alloc.ID))
if err := allocDir.Build(tg.Tasks); err != nil { if err := allocDir.Build(tg.Tasks); err != nil {
@ -345,6 +368,7 @@ func (r *AllocRunner) Run() {
} }
r.ctx = driver.NewExecContext(allocDir, r.alloc.ID) r.ctx = driver.NewExecContext(allocDir, r.alloc.ID)
} }
r.ctxLock.Unlock()
// Check if the allocation is in a terminal status. In this case, we don't // Check if the allocation is in a terminal status. In this case, we don't
// start any of the task runners and directly wait for the destroy signal to // start any of the task runners and directly wait for the destroy signal to
@ -364,8 +388,8 @@ func (r *AllocRunner) Run() {
continue continue
} }
tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.alloc, tr := NewTaskRunner(r.logger, r.config, r.setTaskState, r.ctx, r.Alloc(),
task, r.consulService) task.Copy(), r.consulService)
r.tasks[task.Name] = tr r.tasks[task.Name] = tr
go tr.Run() go tr.Run()
} }

View file

@ -72,8 +72,9 @@ func DefaultConfig() *config.Config {
// are expected to register as a schedulable node to the servers, and to // are expected to register as a schedulable node to the servers, and to
// run allocations as determined by the servers. // run allocations as determined by the servers.
type Client struct { type Client struct {
config *config.Config config *config.Config
start time.Time configLock sync.RWMutex
start time.Time
logger *log.Logger logger *log.Logger
@ -409,7 +410,9 @@ func (c *Client) restoreState() error {
for _, entry := range list { for _, entry := range list {
id := entry.Name() id := entry.Name()
alloc := &structs.Allocation{ID: id} alloc := &structs.Allocation{ID: id}
ar := NewAllocRunner(c.logger, c.config, c.updateAllocStatus, alloc, c.consulService) c.configLock.RLock()
ar := NewAllocRunner(c.logger, c.config.Copy(), c.updateAllocStatus, alloc, c.consulService)
c.configLock.RUnlock()
c.allocs[id] = ar c.allocs[id] = ar
if err := ar.RestoreState(); err != nil { if err := ar.RestoreState(); err != nil {
c.logger.Printf("[ERR] client: failed to restore state for alloc %s: %v", id, err) c.logger.Printf("[ERR] client: failed to restore state for alloc %s: %v", id, err)
@ -524,7 +527,10 @@ func (c *Client) fingerprint() error {
if err != nil { if err != nil {
return err return err
} }
applies, err := fingerprint.FingerprintLocked(f, c.config, c.config.Node)
c.configLock.Lock()
applies, err := f.Fingerprint(c.config, c.config.Node)
c.configLock.Unlock()
if err != nil { if err != nil {
return err return err
} }
@ -552,9 +558,11 @@ func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d t
for { for {
select { select {
case <-time.After(d): case <-time.After(d):
if _, err := fingerprint.FingerprintLocked(f, c.config, c.config.Node); err != nil { c.configLock.Lock()
if _, err := f.Fingerprint(c.config, c.config.Node); err != nil {
c.logger.Printf("[DEBUG] client: periodic fingerprinting for %v failed: %v", name, err) c.logger.Printf("[DEBUG] client: periodic fingerprinting for %v failed: %v", name, err)
} }
c.configLock.Unlock()
case <-c.shutdownCh: case <-c.shutdownCh:
return return
} }
@ -582,7 +590,9 @@ func (c *Client) setupDrivers() error {
if err != nil { if err != nil {
return err return err
} }
applies, err := fingerprint.FingerprintLocked(d, c.config, c.config.Node) c.configLock.Lock()
applies, err := d.Fingerprint(c.config, c.config.Node)
c.configLock.Unlock()
if err != nil { if err != nil {
return err return err
} }
@ -664,6 +674,8 @@ func (c *Client) run() {
// determine if the node properties have changed. It returns the new hash values // determine if the node properties have changed. It returns the new hash values
// in case they are different from the old hash values. // in case they are different from the old hash values.
func (c *Client) hasNodeChanged(oldAttrHash uint64, oldMetaHash uint64) (bool, uint64, uint64) { func (c *Client) hasNodeChanged(oldAttrHash uint64, oldMetaHash uint64) (bool, uint64, uint64) {
c.configLock.RLock()
defer c.configLock.RUnlock()
newAttrHash, err := hashstructure.Hash(c.config.Node.Attributes, nil) newAttrHash, err := hashstructure.Hash(c.config.Node.Attributes, nil)
if err != nil { if err != nil {
c.logger.Printf("[DEBUG] client: unable to calculate node attributes hash: %v", err) c.logger.Printf("[DEBUG] client: unable to calculate node attributes hash: %v", err)
@ -919,7 +931,7 @@ func (c *Client) runAllocs(update *allocUpdates) {
c.allocLock.RLock() c.allocLock.RLock()
exist := make([]*structs.Allocation, 0, len(c.allocs)) exist := make([]*structs.Allocation, 0, len(c.allocs))
for _, ar := range c.allocs { for _, ar := range c.allocs {
exist = append(exist, ar.Alloc()) exist = append(exist, ar.alloc)
} }
c.allocLock.RUnlock() c.allocLock.RUnlock()
@ -988,7 +1000,9 @@ func (c *Client) updateAlloc(exist, update *structs.Allocation) error {
func (c *Client) addAlloc(alloc *structs.Allocation) error { func (c *Client) addAlloc(alloc *structs.Allocation) error {
c.allocLock.Lock() c.allocLock.Lock()
defer c.allocLock.Unlock() defer c.allocLock.Unlock()
ar := NewAllocRunner(c.logger, c.config, c.updateAllocStatus, alloc, c.consulService) c.configLock.RLock()
ar := NewAllocRunner(c.logger, c.config.Copy(), c.updateAllocStatus, alloc, c.consulService)
c.configLock.RUnlock()
c.allocs[alloc.ID] = ar c.allocs[alloc.ID] = ar
go ar.Run() go ar.Run()
return nil return nil

View file

@ -8,6 +8,7 @@ import (
"time" "time"
"github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs"
"github.com/mitchellh/copystructure"
) )
// RPCHandler can be provided to the Client if there is a local server // RPCHandler can be provided to the Client if there is a local server
@ -72,6 +73,19 @@ type Config struct {
Options map[string]string Options map[string]string
} }
// Copy returns a copy of the config produced by copystructure.Copy. The
// LogOutput value is excluded from the structural copy and is instead carried
// over to the result as-is, so the copy shares the receiver's LogOutput.
//
// Copy returns nil if c is nil or if the structural copy fails.
func (c *Config) Copy() *Config {
	if c == nil {
		return nil
	}

	// Clear LogOutput on a shallow clone instead of on the receiver. Copy never
	// writes to c, so callers that hold only a read lock on the config cannot
	// race with each other or observe a temporarily nil LogOutput.
	shallow := *c
	shallow.LogOutput = nil

	i, err := copystructure.Copy(&shallow)
	if err != nil {
		return nil
	}

	dup := i.(*Config)
	dup.LogOutput = c.LogOutput
	return dup
}
// Read returns the specified configuration value or "". // Read returns the specified configuration value or "".
func (c *Config) Read(id string) string { func (c *Config) Read(id string) string {
val, ok := c.Options[id] val, ok := c.Options[id]

View file

@ -3,7 +3,6 @@ package fingerprint
import ( import (
"fmt" "fmt"
"log" "log"
"sync"
"time" "time"
"github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/config"
@ -28,12 +27,6 @@ var BuiltinFingerprints = []string{
"storage", "storage",
} }
var (
// NodeLock ensures that only a single fingerprinter is running at a time
// when using the FingerprintLocked method.
NodeLock sync.Mutex
)
// builtinFingerprintMap contains the built in registered fingerprints // builtinFingerprintMap contains the built in registered fingerprints
// which are available, corresponding to a key found in BuiltinFingerprints // which are available, corresponding to a key found in BuiltinFingerprints
var builtinFingerprintMap = map[string]Factory{ var builtinFingerprintMap = map[string]Factory{
@ -88,10 +81,3 @@ type StaticFingerprinter struct{}
func (s *StaticFingerprinter) Periodic() (bool, time.Duration) { func (s *StaticFingerprinter) Periodic() (bool, time.Duration) {
return false, EmptyDuration return false, EmptyDuration
} }
// FingerprintLocked is used to fingerprint in a thread-safe manner.
func FingerprintLocked(f Fingerprint, config *config.Config, node *structs.Node) (bool, error) {
NodeLock.Lock()
defer NodeLock.Unlock()
return f.Fingerprint(config, node)
}

View file

@ -29,9 +29,10 @@ type TaskRunner struct {
restartTracker *RestartTracker restartTracker *RestartTracker
consulService *ConsulService consulService *ConsulService
task *structs.Task task *structs.Task
updateCh chan *structs.Allocation updateCh chan *structs.Allocation
handle driver.DriverHandle handle driver.DriverHandle
handleLock sync.Mutex
destroy bool destroy bool
destroyCh chan struct{} destroyCh chan struct{}
@ -127,7 +128,9 @@ func (r *TaskRunner) RestoreState() error {
r.task.Name, r.alloc.ID, err) r.task.Name, r.alloc.ID, err)
return nil return nil
} }
r.handleLock.Lock()
r.handle = handle r.handle = handle
r.handleLock.Unlock()
} }
return nil return nil
} }
@ -139,9 +142,11 @@ func (r *TaskRunner) SaveState() error {
snap := taskRunnerState{ snap := taskRunnerState{
Task: r.task, Task: r.task,
} }
r.handleLock.Lock()
if r.handle != nil { if r.handle != nil {
snap.HandleID = r.handle.ID() snap.HandleID = r.handle.ID()
} }
r.handleLock.Unlock()
return persistState(r.stateFilePath(), &snap) return persistState(r.stateFilePath(), &snap)
} }
@ -163,7 +168,10 @@ func (r *TaskRunner) setState(state string, event *structs.TaskEvent) {
// createDriver makes a driver for the task // createDriver makes a driver for the task
func (r *TaskRunner) createDriver() (driver.Driver, error) { func (r *TaskRunner) createDriver() (driver.Driver, error) {
taskEnv, err := driver.GetTaskEnv(r.ctx.AllocDir, r.config.Node, r.task) // Create a copy of the node.
// TODO REMOVE
node := r.config.Node.Copy()
taskEnv, err := driver.GetTaskEnv(r.ctx.AllocDir, node, r.task)
if err != nil { if err != nil {
err = fmt.Errorf("failed to create driver '%s' for alloc %s: %v", err = fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
r.task.Driver, r.alloc.ID, err) r.task.Driver, r.alloc.ID, err)
@ -203,7 +211,9 @@ func (r *TaskRunner) startTask() error {
r.setState(structs.TaskStateDead, e) r.setState(structs.TaskStateDead, e)
return err return err
} }
r.handleLock.Lock()
r.handle = handle r.handle = handle
r.handleLock.Unlock()
r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted)) r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
return nil return nil
} }
@ -222,7 +232,10 @@ func (r *TaskRunner) run() {
var forceStart bool var forceStart bool
for { for {
// Start the task if not yet started or it is being forced. // Start the task if not yet started or it is being forced.
if r.handle == nil || forceStart { r.handleLock.Lock()
handleEmpty := r.handle == nil
r.handleLock.Unlock()
if handleEmpty || forceStart {
forceStart = false forceStart = false
if err := r.startTask(); err != nil { if err := r.startTask(); err != nil {
return return
@ -339,11 +352,13 @@ func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
// Update will update resources and store the new kill timeout. // Update will update resources and store the new kill timeout.
var mErr multierror.Error var mErr multierror.Error
r.handleLock.Lock()
if r.handle != nil { if r.handle != nil {
if err := r.handle.Update(updatedTask); err != nil { if err := r.handle.Update(updatedTask); err != nil {
mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err)) mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
} }
} }
r.handleLock.Unlock()
// Update the restart policy. // Update the restart policy.
if r.restartTracker != nil { if r.restartTracker != nil {

View file

@ -1,9 +1,11 @@
# Increase log verbosity # Increase log verbosity
log_level = "DEBUG" log_level = "INFO"
# Setup data dir # Setup data dir
data_dir = "/tmp/client1" data_dir = "/tmp/client1"
enable_debug = true
# Enable the client # Enable the client
client { client {
enabled = true enabled = true
@ -13,6 +15,9 @@ client {
# like Consul used for service discovery. # like Consul used for service discovery.
servers = ["127.0.0.1:4647"] servers = ["127.0.0.1:4647"]
node_class = "foo" node_class = "foo"
options {
"driver.raw_exec.enable" = "1"
}
} }
# Modify our port to avoid a collision with server1 # Modify our port to avoid a collision with server1

View file

@ -1,5 +1,5 @@
# Increase log verbosity # Increase log verbosity
log_level = "DEBUG" log_level = "INFO"
# Setup data dir # Setup data dir
data_dir = "/tmp/server1" data_dir = "/tmp/server1"

View file

@ -542,6 +542,14 @@ type Node struct {
ModifyIndex uint64 ModifyIndex uint64
} }
// Copy returns a copy of the node made with copystructure.Copy, or nil if the
// copy operation reports an error.
func (n *Node) Copy() *Node {
	dup, err := copystructure.Copy(n)
	if err == nil {
		return dup.(*Node)
	}
	return nil
}
// TerminalStatus returns if the current status is terminal and // TerminalStatus returns if the current status is terminal and
// will no longer transition. // will no longer transition.
func (n *Node) TerminalStatus() bool { func (n *Node) TerminalStatus() bool {
@ -1478,6 +1486,14 @@ type Task struct {
KillTimeout time.Duration `mapstructure:"kill_timeout"` KillTimeout time.Duration `mapstructure:"kill_timeout"`
} }
// Copy returns a copy of the task made with copystructure.Copy, or nil if the
// copy operation reports an error.
func (t *Task) Copy() *Task {
	var dup *Task
	if raw, err := copystructure.Copy(t); err == nil {
		dup = raw.(*Task)
	}
	return dup
}
// InitFields initializes fields in the task. // InitFields initializes fields in the task.
func (t *Task) InitFields(job *Job, tg *TaskGroup) { func (t *Task) InitFields(job *Job, tg *TaskGroup) {
t.InitServiceFields(job.Name, tg.Name) t.InitServiceFields(job.Name, tg.Name)