Adds monitor retries to the consul lock command.

2016-01-05 17:59:58 -08:00 · 2016-01-05 17:59:58 -08:00 · 9323276b58
parent 169825ed90
commit 9323276b58
3 changed files with 177 additions and 75 deletions
--- a/command/lock.go
+++ b/command/lock.go
@ -69,17 +69,29 @@ Options:
  -pass-stdin                Pass stdin to child process.
  -try=duration              Make a single attempt to acquire the lock, waiting
                             up to the given duration (eg. "15s").
+  -monitor-retry=n           Retry up to n times if Consul returns a 500 error
+                             while monitoring the lock. This allows riding out brief
+                             periods of unavailability without causing leader
+                             elections, but increases the amount of time required
+                             to detect a lost lock in some cases. Defaults to 0.
  -verbose                   Enables verbose output
 `
 	return strings.TrimSpace(helpText)
 }

 func (c *LockCommand) Run(args []string) int {
+	var lu *LockUnlock
+	return c.run(args, &lu)
+}
+
+// run exposes the underlying lock for testing.
+func (c *LockCommand) run(args []string, lu **LockUnlock) int {
 	var childDone chan struct{}
 	var name, token string
 	var limit int
 	var passStdin bool
 	var try string
+	var retry int
 	cmdFlags := flag.NewFlagSet("watch", flag.ContinueOnError)
 	cmdFlags.Usage = func() { c.Ui.Output(c.Help()) }
 	cmdFlags.IntVar(&limit, "n", 1, "")
@ -87,6 +99,7 @@ func (c *LockCommand) Run(args []string) int {
 	cmdFlags.StringVar(&token, "token", "", "")
 	cmdFlags.BoolVar(&passStdin, "pass-stdin", false, "")
 	cmdFlags.StringVar(&try, "try", "", "")
+	cmdFlags.IntVar(&retry, "monitor-retry", 0, "")
 	cmdFlags.BoolVar(&c.verbose, "verbose", false, "")
 	httpAddr := HTTPAddrFlag(cmdFlags)
 	if err := cmdFlags.Parse(args); err != nil {
@ -135,6 +148,12 @@ func (c *LockCommand) Run(args []string) int {
 		oneshot = true
 	}

+	// Check the retry parameter
+	if retry < 0 {
+		c.Ui.Error("Number for 'monitor-retry' must be >= 0")
+		return 1
+	}
+
 	// Create and test the HTTP client
 	conf := api.DefaultConfig()
 	conf.Address = *httpAddr
@ -151,11 +170,10 @@ func (c *LockCommand) Run(args []string) int {
 	}

 	// Setup the lock or semaphore
-	var lu *LockUnlock
 	if limit == 1 {
-		lu, err = c.setupLock(client, prefix, name, oneshot, wait)
+		*lu, err = c.setupLock(client, prefix, name, oneshot, wait, retry)
 	} else {
-		lu, err = c.setupSemaphore(client, limit, prefix, name, oneshot, wait)
+		*lu, err = c.setupSemaphore(client, limit, prefix, name, oneshot, wait, retry)
 	}
 	if err != nil {
 		c.Ui.Error(fmt.Sprintf("Lock setup failed: %s", err))
@ -166,7 +184,7 @@ func (c *LockCommand) Run(args []string) int {
 	if c.verbose {
 		c.Ui.Info("Attempting lock acquisition")
 	}
-	lockCh, err := lu.lockFn(c.ShutdownCh)
+	lockCh, err := (*lu).lockFn(c.ShutdownCh)
 	if lockCh == nil {
 		if err == nil {
 			c.Ui.Error("Shutdown triggered or timeout during lock acquisition")
@ -219,14 +237,14 @@ func (c *LockCommand) Run(args []string) int {

 RELEASE:
 	// Release the lock before termination
-	if err := lu.unlockFn(); err != nil {
+	if err := (*lu).unlockFn(); err != nil {
 		c.Ui.Error(fmt.Sprintf("Lock release failed: %s", err))
 		return 1
 	}

 	// Cleanup the lock if no longer in use
-	if err := lu.cleanupFn(); err != nil {
-		if err != lu.inUseErr {
+	if err := (*lu).cleanupFn(); err != nil {
+		if err != (*lu).inUseErr {
 			c.Ui.Error(fmt.Sprintf("Lock cleanup failed: %s", err))
 			return 1
 		} else if c.verbose {
@ -240,8 +258,11 @@ RELEASE:

 // setupLock is used to setup a new Lock given the API client, the key prefix to
 // operate on, and an optional session name. If oneshot is true then we will set
-// up for a single attempt at acquisition, using the given wait time.
-func (c *LockCommand) setupLock(client *api.Client, prefix, name string, oneshot bool, wait time.Duration) (*LockUnlock, error) {
+// up for a single attempt at acquisition, using the given wait time. The retry
+// parameter sets how many 500 errors the lock monitor will tolerate before
+// giving up the lock.
+func (c *LockCommand) setupLock(client *api.Client, prefix, name string,
+	oneshot bool, wait time.Duration, retry int) (*LockUnlock, error) {
 	// Use the DefaultSemaphoreKey extension, this way if a lock and
 	// semaphore are both used at the same prefix, we will get a conflict
 	// which we can report to the user.
@ -250,8 +271,9 @@ func (c *LockCommand) setupLock(client *api.Client, prefix, name string, oneshot
 		c.Ui.Info(fmt.Sprintf("Setting up lock at path: %s", key))
 	}
 	opts := api.LockOptions{
-		Key:         key,
-		SessionName: name,
+		Key:            key,
+		SessionName:    name,
+		MonitorRetries: retry,
 	}
 	if oneshot {
 		opts.LockTryOnce = true
@ -266,21 +288,26 @@ func (c *LockCommand) setupLock(client *api.Client, prefix, name string, oneshot
 		unlockFn:  l.Unlock,
 		cleanupFn: l.Destroy,
 		inUseErr:  api.ErrLockInUse,
+		rawOpts:   &opts,
 	}
 	return lu, nil
 }

 // setupSemaphore is used to setup a new Semaphore given the API client, key
 // prefix, session name, and slot holder limit. If oneshot is true then we will
-// set up for a single attempt at acquisition, using the given wait time.
-func (c *LockCommand) setupSemaphore(client *api.Client, limit int, prefix, name string, oneshot bool, wait time.Duration) (*LockUnlock, error) {
+// set up for a single attempt at acquisition, using the given wait time. The
+// retry parameter sets how many 500 errors the lock monitor will tolerate
+// before giving up the semaphore.
+func (c *LockCommand) setupSemaphore(client *api.Client, limit int, prefix, name string,
+	oneshot bool, wait time.Duration, retry int) (*LockUnlock, error) {
 	if c.verbose {
 		c.Ui.Info(fmt.Sprintf("Setting up semaphore (limit %d) at prefix: %s", limit, prefix))
 	}
 	opts := api.SemaphoreOptions{
-		Prefix:      prefix,
-		Limit:       limit,
-		SessionName: name,
+		Prefix:         prefix,
+		Limit:          limit,
+		SessionName:    name,
+		MonitorRetries: retry,
 	}
 	if oneshot {
 		opts.SemaphoreTryOnce = true
@ -295,6 +322,7 @@ func (c *LockCommand) setupSemaphore(client *api.Client, limit int, prefix, name
 		unlockFn:  s.Release,
 		cleanupFn: s.Destroy,
 		inUseErr:  api.ErrSemaphoreInUse,
+		rawOpts:   &opts,
 	}
 	return lu, nil
 }
@ -408,4 +436,5 @@ type LockUnlock struct {
 	unlockFn  func() error
 	cleanupFn func() error
 	inUseErr  error
+	rawOpts   interface{}
 }
--- a/command/lock_test.go
+++ b/command/lock_test.go
@ -5,9 +5,10 @@ import (
 	"io/ioutil"
 	"path/filepath"
 	"strings"
-	"sync"
 	"testing"
+	"time"

+	"github.com/hashicorp/consul/api"
 	"github.com/mitchellh/cli"
 )

@ -15,17 +16,22 @@ func TestLockCommand_implements(t *testing.T) {
 	var _ cli.Command = &LockCommand{}
 }

-func TestLockCommand_BadArgs(t *testing.T) {
+func argFail(t *testing.T, args []string, expected string) {
 	ui := new(cli.MockUi)
 	c := &LockCommand{Ui: ui}
-
-	if code := c.Run([]string{"-try=blah"}); code != 1 {
+	if code := c.Run(args); code != 1 {
 		t.Fatalf("expected return code 1, got %d", code)
 	}
-
-	if code := c.Run([]string{"-try=-10s"}); code != 1 {
-		t.Fatalf("expected return code 1, got %d", code)
+	if reason := ui.ErrorWriter.String(); !strings.Contains(reason, expected) {
+		t.Fatalf("bad reason: got='%s', expected='%s'", reason, expected)
 	}
+
+}
+
+func TestLockCommand_BadArgs(t *testing.T) {
+	argFail(t, []string{"-try=blah", "test/prefix", "date"}, "parsing duration")
+	argFail(t, []string{"-try=-10s", "test/prefix", "date"}, "must be positive")
+	argFail(t, []string{"-monitor-retry=-5", "test/prefix", "date"}, "must be >= 0")
 }

 func TestLockCommand_Run(t *testing.T) {
@ -51,68 +57,130 @@ func TestLockCommand_Run(t *testing.T) {
 	}
 }

-func runTry(t *testing.T, n int) {
+func TestLockCommand_Try_Lock(t *testing.T) {
 	a1 := testAgent(t)
 	defer a1.Shutdown()
 	waitForLeader(t, a1.httpAddr)

-	// Define a long-running command.
-	nArg := fmt.Sprintf("-n=%d", n)
-	args := []string{"-http-addr=" + a1.httpAddr, nArg, "-try=250ms", "test/prefix", "sleep 2"}
+	ui := new(cli.MockUi)
+	c := &LockCommand{Ui: ui}
+	filePath := filepath.Join(a1.dir, "test_touch")
+	touchCmd := fmt.Sprintf("touch '%s'", filePath)
+	args := []string{"-http-addr=" + a1.httpAddr, "-try=10s", "test/prefix", touchCmd}

-	// Run several commands at once.
-	var wg sync.WaitGroup
-	locked := make([]bool, n+1)
-	tried := make([]bool, n+1)
-	for i := 0; i < n+1; i++ {
-		wg.Add(1)
-		go func(index int) {
-			ui := new(cli.MockUi)
-			c := &LockCommand{Ui: ui}
-
-			code := c.Run(append([]string{"-try=250ms"}, args...))
-			if code == 0 {
-				locked[index] = true
-			} else {
-				reason := ui.ErrorWriter.String()
-				if !strings.Contains(reason, "Shutdown triggered or timeout during lock acquisition") {
-					t.Fatalf("bad reason: %s", reason)
-				}
-				tried[index] = true
-			}
-			wg.Done()
-		}(i)
+	// Run the command.
+	var lu *LockUnlock
+	code := c.run(args, &lu)
+	if code != 0 {
+		t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
 	}
-	wg.Wait()
-
-	// Tally up the outcomes.
-	totalLocked := 0
-	totalTried := 0
-	for i := 0; i < n+1; i++ {
-		if locked[i] == tried[i] {
-			t.Fatalf("command %d didn't lock or try, or did both", i+1)
-		}
-		if locked[i] {
-			totalLocked++
-		}
-		if tried[i] {
-			totalTried++
-		}
+	_, err := ioutil.ReadFile(filePath)
+	if err != nil {
+		t.Fatalf("err: %v", err)
 	}

-	// We can't check exact counts because sometimes the try attempts may
-	// fail because they get woken up but need to do another try, but we
-	// should get one of each outcome.
-	if totalLocked == 0 || totalTried == 0 {
-		t.Fatalf("unexpected outcome: locked=%d, tried=%d", totalLocked, totalTried)
+	// Make sure the try options were set correctly.
+	opts, ok := lu.rawOpts.(*api.LockOptions)
+	if !ok {
+		t.Fatalf("bad type")
+	}
+	if !opts.LockTryOnce || opts.LockWaitTime != 10*time.Second {
+		t.Fatalf("bad: %#v", opts)
 	}
-}
-
-func TestLockCommand_Try_Lock(t *testing.T) {
-	runTry(t, 1)
 }

 func TestLockCommand_Try_Semaphore(t *testing.T) {
-	runTry(t, 2)
-	runTry(t, 3)
+	a1 := testAgent(t)
+	defer a1.Shutdown()
+	waitForLeader(t, a1.httpAddr)
+
+	ui := new(cli.MockUi)
+	c := &LockCommand{Ui: ui}
+	filePath := filepath.Join(a1.dir, "test_touch")
+	touchCmd := fmt.Sprintf("touch '%s'", filePath)
+	args := []string{"-http-addr=" + a1.httpAddr, "-n=3", "-try=10s", "test/prefix", touchCmd}
+
+	// Run the command.
+	var lu *LockUnlock
+	code := c.run(args, &lu)
+	if code != 0 {
+		t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
+	}
+	_, err := ioutil.ReadFile(filePath)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Make sure the try options were set correctly.
+	opts, ok := lu.rawOpts.(*api.SemaphoreOptions)
+	if !ok {
+		t.Fatalf("bad type")
+	}
+	if !opts.SemaphoreTryOnce || opts.SemaphoreWaitTime != 10*time.Second {
+		t.Fatalf("bad: %#v", opts)
+	}
+}
+
+func TestLockCommand_MonitorRetry_Lock(t *testing.T) {
+	a1 := testAgent(t)
+	defer a1.Shutdown()
+	waitForLeader(t, a1.httpAddr)
+
+	ui := new(cli.MockUi)
+	c := &LockCommand{Ui: ui}
+	filePath := filepath.Join(a1.dir, "test_touch")
+	touchCmd := fmt.Sprintf("touch '%s'", filePath)
+	args := []string{"-http-addr=" + a1.httpAddr, "-monitor-retry=3", "test/prefix", touchCmd}
+
+	// Run the command.
+	var lu *LockUnlock
+	code := c.run(args, &lu)
+	if code != 0 {
+		t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
+	}
+	_, err := ioutil.ReadFile(filePath)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Make sure the monitor options were set correctly.
+	opts, ok := lu.rawOpts.(*api.LockOptions)
+	if !ok {
+		t.Fatalf("bad type")
+	}
+	if opts.MonitorRetries != 3 {
+		t.Fatalf("bad: %d", opts.MonitorRetries)
+	}
+}
+
+func TestLockCommand_MonitorRetry_Semaphore(t *testing.T) {
+	a1 := testAgent(t)
+	defer a1.Shutdown()
+	waitForLeader(t, a1.httpAddr)
+
+	ui := new(cli.MockUi)
+	c := &LockCommand{Ui: ui}
+	filePath := filepath.Join(a1.dir, "test_touch")
+	touchCmd := fmt.Sprintf("touch '%s'", filePath)
+	args := []string{"-http-addr=" + a1.httpAddr, "-n=3", "-monitor-retry=3", "test/prefix", touchCmd}
+
+	// Run the command.
+	var lu *LockUnlock
+	code := c.run(args, &lu)
+	if code != 0 {
+		t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
+	}
+	_, err := ioutil.ReadFile(filePath)
+	if err != nil {
+		t.Fatalf("err: %v", err)
+	}
+
+	// Make sure the monitor options were set correctly.
+	opts, ok := lu.rawOpts.(*api.SemaphoreOptions)
+	if !ok {
+		t.Fatalf("bad type")
+	}
+	if opts.MonitorRetries != 3 {
+		t.Fatalf("bad: %d", opts.MonitorRetries)
+	}
 }
--- a/website/source/docs/commands/lock.html.markdown
+++ b/website/source/docs/commands/lock.html.markdown
@ -67,5 +67,10 @@ The list of available flags are:
  unit suffix, such as "500ms". Valid time units are "ns", "us" (or "µs"), "ms",
  "s", "m", "h".

+* `-monitor-retry` - Retry up to this number of times if Consul returns a 500 error
+   while monitoring the lock. This allows riding out brief periods of unavailability
+   without causing leader elections, but increases the amount of time required
+   to detect a lost lock in some cases. Defaults to 0.
+
 * `-verbose` - Enables verbose output.