package spawn import ( "bytes" "encoding/json" "fmt" "io" "os" "os/exec" "strconv" "time" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/driver/structs" "github.com/hashicorp/nomad/command" "github.com/hashicorp/nomad/helper/discover" ) // Spawner is used to start a user command in an isolated fashion that is // resistent to Nomad agent failure. type Spawner struct { spawn *os.Process SpawnPid int SpawnPpid int StateFile string UserPid int // User configuration UserCmd *exec.Cmd Logs *Logs Chroot string } // Logs is used to define the filepaths the user command's logs should be // redirected to. The files do not need to exist. type Logs struct { Stdin, Stdout, Stderr string } // NewSpawner takes a path to a state file. This state file can be used to // create a new Spawner that can be used to wait on the exit status of a // process even through Nomad restarts. func NewSpawner(stateFile string) *Spawner { return &Spawner{StateFile: stateFile} } // SetCommand sets the user command to spawn. func (s *Spawner) SetCommand(cmd *exec.Cmd) { s.UserCmd = cmd } // SetLogs sets the redirection of user command log files. func (s *Spawner) SetLogs(l *Logs) { s.Logs = l } // SetChroot puts the user command into a chroot. func (s *Spawner) SetChroot(root string) { s.Chroot = root } // Spawn does a double-fork to start and isolate the user command. It takes a // call-back that is invoked with the pid of the intermediary process. If the // call back returns an error, the user command is not started and the spawn is // cancelled. This can be used to put the process into a cgroup or jail and // cancel starting the user process if that was not successful. An error is // returned if the call-back returns an error or the user-command couldn't be // started. func (s *Spawner) Spawn(cb func(pid int) error) error { bin, err := discover.NomadExecutable() if err != nil { return fmt.Errorf("Failed to determine the nomad executable: %v", err) } exitFile, err := os.OpenFile(s.StateFile, os.O_CREATE|os.O_WRONLY, 0666) if err != nil { return fmt.Errorf("Error opening file to store exit status: %v", err) } defer exitFile.Close() config, err := s.spawnConfig() if err != nil { return err } spawn := exec.Command(bin, "spawn-daemon", config) // Capture stdout spawnStdout, err := spawn.StdoutPipe() if err != nil { return fmt.Errorf("Failed to capture spawn-daemon stdout: %v", err) } defer spawnStdout.Close() // Capture stdin. spawnStdin, err := spawn.StdinPipe() if err != nil { return fmt.Errorf("Failed to capture spawn-daemon stdin: %v", err) } defer spawnStdin.Close() if err := spawn.Start(); err != nil { return fmt.Errorf("Failed to call spawn-daemon on nomad executable: %v", err) } if cb != nil { cbErr := cb(spawn.Process.Pid) if cbErr != nil { errs := new(multierror.Error) errs = multierror.Append(errs, cbErr) if err := s.sendAbortCommand(spawnStdin); err != nil { errs = multierror.Append(errs, err) } return errs } } if err := s.sendStartCommand(spawnStdin); err != nil { return err } respCh := make(chan command.SpawnStartStatus, 1) errCh := make(chan error, 1) go func() { var resp command.SpawnStartStatus dec := json.NewDecoder(spawnStdout) if err := dec.Decode(&resp); err != nil { errCh <- fmt.Errorf("Failed to parse spawn-daemon start response: %v", err) } respCh <- resp }() select { case err := <-errCh: return err case resp := <-respCh: if resp.ErrorMsg != "" { return fmt.Errorf("Failed to execute user command: %s", resp.ErrorMsg) } s.UserPid = resp.UserPID case <-time.After(5 * time.Second): return fmt.Errorf("timed out waiting for response") } // Store the spawn process. s.spawn = spawn.Process s.SpawnPid = s.spawn.Pid s.SpawnPpid = os.Getpid() return nil } // spawnConfig returns a serialized config to pass to the Nomad spawn-daemon // command. func (s *Spawner) spawnConfig() (string, error) { if s.UserCmd == nil { return "", fmt.Errorf("Must specify user command") } config := command.DaemonConfig{ Cmd: *s.UserCmd, Chroot: s.Chroot, ExitStatusFile: s.StateFile, } if s.Logs != nil { config.StdoutFile = s.Logs.Stdout config.StdinFile = s.Logs.Stdin config.StderrFile = s.Logs.Stderr } var buffer bytes.Buffer enc := json.NewEncoder(&buffer) if err := enc.Encode(config); err != nil { return "", fmt.Errorf("Failed to serialize configuration: %v", err) } return strconv.Quote(buffer.String()), nil } // sendStartCommand sends the necessary command to the spawn-daemon to have it // start the user process. func (s *Spawner) sendStartCommand(w io.Writer) error { enc := json.NewEncoder(w) if err := enc.Encode(true); err != nil { return fmt.Errorf("Failed to serialize start command: %v", err) } return nil } // sendAbortCommand sends the necessary command to the spawn-daemon to have it // abort starting the user process. This should be invoked if the spawn-daemon // could not be isolated into a cgroup. func (s *Spawner) sendAbortCommand(w io.Writer) error { enc := json.NewEncoder(w) if err := enc.Encode(false); err != nil { return fmt.Errorf("Failed to serialize abort command: %v", err) } return nil } // Wait returns the exit code of the user process or an error if the wait // failed. func (s *Spawner) Wait() *structs.WaitResult { if os.Getpid() == s.SpawnPpid { return s.waitAsParent() } return s.pollWait() } // waitAsParent waits on the process if the current process was the spawner. func (s *Spawner) waitAsParent() *structs.WaitResult { if s.SpawnPpid != os.Getpid() { return structs.NewWaitResult(-1, 0, fmt.Errorf("not the parent. Spawner parent is %v; current pid is %v", s.SpawnPpid, os.Getpid())) } // Try to reattach to the spawn. if s.spawn == nil { // If it can't be reattached, it means the spawn process has exited so // we should just read its exit file. var err error if s.spawn, err = os.FindProcess(s.SpawnPid); err != nil { return s.pollWait() } } if _, err := s.spawn.Wait(); err != nil { return structs.NewWaitResult(-1, 0, err) } return s.pollWait() } // pollWait polls on the spawn daemon to determine when it exits. After it // exits, it reads the state file and returns the exit code and possibly an // error. func (s *Spawner) pollWait() *structs.WaitResult { // Stat to check if it is there to avoid a race condition. stat, err := os.Stat(s.StateFile) if err != nil { return structs.NewWaitResult(-1, 0, fmt.Errorf("Failed to Stat exit status file %v: %v", s.StateFile, err)) } // If there is data it means that the file has already been written. if stat.Size() > 0 { return s.readExitCode() } // Read after the process exits. ticker := time.NewTicker(5 * time.Second) defer ticker.Stop() for range ticker.C { if !s.Alive() { break } } return s.readExitCode() } // readExitCode parses the state file and returns the exit code of the task. It // returns an error if the file can't be read. func (s *Spawner) readExitCode() *structs.WaitResult { f, err := os.Open(s.StateFile) if err != nil { return structs.NewWaitResult(-1, 0, fmt.Errorf("Failed to open %v to read exit code: %v", s.StateFile, err)) } defer f.Close() stat, err := f.Stat() if err != nil { return structs.NewWaitResult(-1, 0, fmt.Errorf("Failed to stat file %v: %v", s.StateFile, err)) } if stat.Size() == 0 { return structs.NewWaitResult(-1, 0, fmt.Errorf("Empty state file: %v", s.StateFile)) } var exitStatus command.SpawnExitStatus dec := json.NewDecoder(f) if err := dec.Decode(&exitStatus); err != nil { return structs.NewWaitResult(-1, 0, fmt.Errorf("Failed to parse exit status from %v: %v", s.StateFile, err)) } return structs.NewWaitResult(exitStatus.ExitCode, 0, nil) } // Valid checks that the state of the Spawner is valid and that a subsequent // Wait could be called. This is useful to call when reopening a Spawner // through client restarts. If Valid a nil error is returned. func (s *Spawner) Valid() error { // If the spawner is still alive, then the task is running and we can wait // on it. if s.Alive() { return nil } // The task isn't alive so check that there is a valid exit code file. if res := s.readExitCode(); res.Err == nil { return nil } return fmt.Errorf("Spawner not alive and exit code not written") }