command/lock: Add -child-exitcode, return 2 on child error (#3329)

* Exit 2 if -child-exit-code and the child returned with an error.
* There is no platform independent way to check the exact return code of
* the child, so on error always return 2.
* Closes #947
* Closes #1503
This commit is contained in:
James Phillips 2017-07-26 22:09:19 -07:00 committed by GitHub
parent 005c9ad12a
commit 9b70009ab8
3 changed files with 64 additions and 14 deletions

View File

@ -41,8 +41,7 @@ type LockCommand struct {
child *os.Process
childLock sync.Mutex
verbose bool
verbose bool
}
func (c *LockCommand) Help() string {
@ -75,14 +74,18 @@ func (c *LockCommand) Run(args []string) int {
}
func (c *LockCommand) run(args []string, lu **LockUnlock) int {
var childDone chan struct{}
var limit int
var monitorRetry int
var name string
var passStdin bool
var propagateChildCode bool
var timeout time.Duration
f := c.BaseCommand.NewFlagSet(c)
f.BoolVar(&propagateChildCode, "child-exit-code", false,
"Exit 2 if the child process exited with an error if this is true, "+
"otherwise this doesn't propagate an error from the child. The "+
"default value is false.")
f.IntVar(&limit, "n", 1,
"Optional limit on the number of concurrent lock holders. The underlying "+
"implementation switches from a lock to a semaphore when the value is "+
@ -100,7 +103,7 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
"Pass stdin to the child process.")
f.DurationVar(&timeout, "timeout", 0,
"Maximum amount of time to wait to acquire the lock, specified as a "+
"timestamp like \"1s\" or \"3h\". The default value is 0.")
"duration like \"1s\" or \"3h\". The default value is 0.")
f.BoolVar(&c.verbose, "verbose", false,
"Enable verbose (debugging) output.")
@ -185,6 +188,8 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
}
// Check if we were shutdown but managed to still acquire the lock
var childCode int
var childErr chan error
select {
case <-c.ShutdownCh:
c.UI.Error("Shutdown triggered during lock acquisition")
@ -193,11 +198,9 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
}
// Start the child process
childDone = make(chan struct{})
childErr = make(chan error, 1)
go func() {
if err := c.startChild(script, childDone, passStdin); err != nil {
c.UI.Error(fmt.Sprintf("%s", err))
}
childErr <- c.startChild(script, passStdin)
}()
// Monitor for shutdown, child termination, or lock loss
@ -210,7 +213,10 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
if c.verbose {
c.UI.Info("Lock lost, killing child")
}
case <-childDone:
case err := <-childErr:
if err != nil {
childCode = 2
}
if c.verbose {
c.UI.Info("Child terminated, releasing lock")
}
@ -220,8 +226,9 @@ func (c *LockCommand) run(args []string, lu **LockUnlock) int {
// Prevent starting a new child. The lock is never released
// after this point.
c.childLock.Lock()
// Kill any existing child
if err := c.killChild(childDone); err != nil {
if err := c.killChild(childErr); err != nil {
c.UI.Error(fmt.Sprintf("%s", err))
}
@ -243,6 +250,13 @@ RELEASE:
} else if c.verbose {
c.UI.Info("Cleanup succeeded")
}
// If we detected an error from the child process then we propagate
// that.
if propagateChildCode {
return childCode
}
return 0
}
@ -321,8 +335,7 @@ func (c *LockCommand) setupSemaphore(client *api.Client, limit int, prefix, name
// startChild is a long running routine used to start and
// wait for the child process to exit.
func (c *LockCommand) startChild(script string, doneCh chan struct{}, passStdin bool) error {
defer close(doneCh)
func (c *LockCommand) startChild(script string, passStdin bool) error {
if c.verbose {
c.UI.Info(fmt.Sprintf("Starting handler '%s'", script))
}
@ -373,7 +386,7 @@ func (c *LockCommand) startChild(script string, doneCh chan struct{}, passStdin
// termination.
// On Windows, the child is always hard terminated with a SIGKILL, even
// on the first attempt.
func (c *LockCommand) killChild(childDone chan struct{}) error {
func (c *LockCommand) killChild(childErr chan error) error {
// Get the child process
child := c.child
@ -395,7 +408,7 @@ func (c *LockCommand) killChild(childDone chan struct{}) error {
// Wait for termination, or until a timeout
select {
case <-childDone:
case <-childErr:
if c.verbose {
c.UI.Info("Child terminated")
}

View File

@ -257,3 +257,33 @@ func TestLockCommand_MonitorRetry_Semaphore_Arg(t *testing.T) {
t.Fatalf("bad: %#v", opts)
}
}
func TestLockCommand_ChildExitCode(t *testing.T) {
t.Parallel()
a := agent.NewTestAgent(t.Name(), nil)
defer a.Shutdown()
t.Run("clean exit", func(t *testing.T) {
_, c := testLockCommand(t)
args := []string{"-http-addr=" + a.HTTPAddr(), "-child-exit-code", "test/prefix", "exit 0"}
if got, want := c.Run(args), 0; got != want {
t.Fatalf("got %d want %d", got, want)
}
})
t.Run("error exit", func(t *testing.T) {
_, c := testLockCommand(t)
args := []string{"-http-addr=" + a.HTTPAddr(), "-child-exit-code", "test/prefix", "exit 1"}
if got, want := c.Run(args), 2; got != want {
t.Fatalf("got %d want %d", got, want)
}
})
t.Run("not propagated", func(t *testing.T) {
_, c := testLockCommand(t)
args := []string{"-http-addr=" + a.HTTPAddr(), "test/prefix", "exit 1"}
if got, want := c.Run(args), 0; got != want {
t.Fatalf("got %d want %d", got, want)
}
})
}

View File

@ -52,6 +52,10 @@ Windows has no POSIX compatible notion for `SIGTERM`.
#### Command Options
* `-child-exit-code` - Exit 2 if the child process exited with an error
if this is true, otherwise this doesn't propagate an error from the
child. The default value is false.
* `-monitor-retry` - Retry up to this number of times if Consul returns a 500 error
while monitoring the lock. This allows riding out brief periods of unavailability
without causing leader elections, but increases the amount of time required
@ -67,6 +71,9 @@ Windows has no POSIX compatible notion for `SIGTERM`.
* `-pass-stdin` - Pass stdin to child process.
* `timeout` - Maximum amount of time to wait to acquire the lock, specified
as a duration like `1s` or `3h`. The default value is 0.
* `-try` - Attempt to acquire the lock up to the given timeout. The timeout is a
positive decimal number, with unit suffix, such as "500ms". Valid time units
are "ns", "us" (or "µs"), "ms", "s", "m", "h".