Merge pull request #1146 from hashicorp/step-down

Provide 'sys/step-down' and 'vault step-down'
This commit is contained in:
Jeff Mitchell 2016-03-03 12:30:08 -05:00
commit 3e7bca82a1
11 changed files with 408 additions and 23 deletions

10
api/sys_stepdown.go Normal file
View File

@ -0,0 +1,10 @@
package api
// StepDown issues a PUT to /v1/sys/step-down and returns whatever error
// RawRequest reports.
func (c *Sys) StepDown() error {
	r := c.c.NewRequest("PUT", "/v1/sys/step-down")
	resp, err := c.c.RawRequest(r)
	// Close the body whenever a response was handed back, not only on
	// success. NOTE(review): presumably RawRequest can return a response
	// together with an error (e.g. for error status codes) -- confirm; the
	// nil checks keep this safe either way.
	if resp != nil && resp.Body != nil {
		defer resp.Body.Close()
	}
	return err
}

View File

@ -224,6 +224,12 @@ func Commands(metaPtr *command.Meta) map[string]cli.CommandFactory {
}, nil
},
"step-down": func() (cli.Command, error) {
return &command.StepDownCommand{
Meta: meta,
}, nil
},
"mount": func() (cli.Command, error) {
return &command.MountCommand{
Meta: meta,

54
command/step-down.go Normal file
View File

@ -0,0 +1,54 @@
package command
import (
"fmt"
"strings"
)
// StepDownCommand is a Command that forces the Vault node to give up
// active duty.
type StepDownCommand struct {
Meta
}
// Run parses the command-line flags, builds an API client and calls
// Sys().StepDown() on it. It returns 0 on success, 1 when flag parsing or the
// step-down call fails, and 2 when the client cannot be initialized.
func (c *StepDownCommand) Run(args []string) int {
	flagSet := c.Meta.FlagSet("step-down", FlagSetDefault)
	flagSet.Usage = func() { c.Ui.Error(c.Help()) }
	if parseErr := flagSet.Parse(args); parseErr != nil {
		return 1
	}

	apiClient, clientErr := c.Client()
	if clientErr != nil {
		c.Ui.Error(fmt.Sprintf("Error initializing client: %s", clientErr))
		return 2
	}

	stepErr := apiClient.Sys().StepDown()
	if stepErr != nil {
		c.Ui.Error(fmt.Sprintf("Error stepping down: %s", stepErr))
		return 1
	}

	return 0
}
// Synopsis returns the one-line description of the step-down command.
func (c *StepDownCommand) Synopsis() string {
	const synopsis = "Force the Vault node to give up active duty"
	return synopsis
}
// Help returns the usage text for the step-down command with the output of
// generalOptionsUsage() appended, trimmed of surrounding whitespace.
func (c *StepDownCommand) Help() string {
helpText := `
Usage: vault step-down [options]
Force the Vault node to step down from active duty.
This causes the indicated node to give up active status. Note that while the
affected node will have a short delay before attempting to grab the lock
again, if no other node grabs the lock beforehand, it is possible for the
same node to re-grab the lock and become active again.
General Options:
` + generalOptionsUsage()
return strings.TrimSpace(helpText)
}

View File

@ -23,6 +23,7 @@ func Handler(core *vault.Core) http.Handler {
mux.Handle("/v1/sys/init", handleSysInit(core))
mux.Handle("/v1/sys/seal-status", handleSysSealStatus(core))
mux.Handle("/v1/sys/seal", handleSysSeal(core))
mux.Handle("/v1/sys/step-down", handleSysStepDown(core))
mux.Handle("/v1/sys/unseal", handleSysUnseal(core))
mux.Handle("/v1/sys/renew/", handleLogical(core, false))
mux.Handle("/v1/sys/leader", handleSysLeader(core))

View File

@ -34,6 +34,29 @@ func handleSysSeal(core *vault.Core) http.Handler {
})
}
// handleSysStepDown returns the HTTP handler for the step-down endpoint.
// Only PUT and POST are accepted; any other method is answered with
// StatusMethodNotAllowed. The client token found on the request is handed to
// core.StepDown; an error from that call is reported as
// StatusInternalServerError, otherwise respondOk is invoked with no body.
func handleSysStepDown(core *vault.Core) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.Method != "PUT" && r.Method != "POST" {
			respondError(w, http.StatusMethodNotAllowed, nil)
			return
		}

		// Get the auth for the request so we can access the token directly
		authReq := requestAuth(r, &logical.Request{})

		// Step down using the token extracted above
		stepErr := core.StepDown(authReq.ClientToken)
		if stepErr != nil {
			respondError(w, http.StatusInternalServerError, stepErr)
			return
		}

		respondOk(w, nil)
	})
}
func handleSysUnseal(core *vault.Core) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
switch r.Method {

View File

@ -304,3 +304,13 @@ func TestSysSeal_Permissions(t *testing.T) {
httpResp = testHttpPut(t, "child", addr+"/v1/sys/seal", nil)
testResponseStatus(t, httpResp, 204)
}
// TestSysStepDown checks that a PUT to /v1/sys/step-down, made with the token
// returned by vault.TestCoreUnsealed, is answered with a 204.
func TestSysStepDown(t *testing.T) {
	core, _, authToken := vault.TestCoreUnsealed(t)

	listener, address := TestServer(t, core)
	defer listener.Close()
	TestServerAuth(t, address, authToken)

	stepDownURL := address + "/v1/sys/step-down"
	httpResp := testHttpPut(t, authToken, stepDownURL, nil)
	testResponseStatus(t, httpResp, 204)
}

View File

@ -64,6 +64,10 @@ const (
// leaderPrefixCleanDelay is how long to wait between deletions
// of orphaned leader keys, to prevent slamming the backend.
leaderPrefixCleanDelay = 200 * time.Millisecond
// manualStepDownSleepPeriod is how long to sleep after a user-initiated
// step down of the active node, to prevent instantly regrabbing the lock
manualStepDownSleepPeriod = 10 * time.Second
)
var (
@ -206,9 +210,10 @@ type Core struct {
stateLock sync.RWMutex
sealed bool
standby bool
standbyDoneCh chan struct{}
standbyStopCh chan struct{}
standby bool
standbyDoneCh chan struct{}
standbyStopCh chan struct{}
manualStepDownCh chan struct{}
// unlockParts has the keys provided to Unseal until
// the threshold number of parts is available.
@ -1149,7 +1154,8 @@ func (c *Core) Unseal(key []byte) (bool, error) {
// Go to standby mode, wait until we are active to unseal
c.standbyDoneCh = make(chan struct{})
c.standbyStopCh = make(chan struct{})
go c.runStandby(c.standbyDoneCh, c.standbyStopCh)
c.manualStepDownCh = make(chan struct{})
go c.runStandby(c.standbyDoneCh, c.standbyStopCh, c.manualStepDownCh)
}
// Success!
@ -1161,6 +1167,7 @@ func (c *Core) Unseal(key []byte) (bool, error) {
// be unsealed again to perform any further operations.
func (c *Core) Seal(token string) (retErr error) {
defer metrics.MeasureSince([]string{"core", "seal"}, time.Now())
c.stateLock.Lock()
defer c.stateLock.Unlock()
if c.sealed {
@ -1173,15 +1180,8 @@ func (c *Core) Seal(token string) (retErr error) {
Path: "sys/seal",
ClientToken: token,
}
acl, te, err := c.fetchACLandTokenEntry(req)
// Attempt to use the token (decrement num_uses)
if te != nil {
if err := c.tokenStore.UseToken(te); err != nil {
c.logger.Printf("[ERR] core: failed to use token: %v", err)
retErr = ErrInternalError
}
}
acl, te, err := c.fetchACLandTokenEntry(req)
if err != nil {
// Since there is no token store in standby nodes, sealing cannot
// be done. Ideally, the request has to be forwarded to leader node
@ -1189,11 +1189,20 @@ func (c *Core) Seal(token string) (retErr error) {
// just returning with an error and recommending a vault restart, which
// essentially does the same thing.
if c.standby {
c.logger.Printf("[ERR] core: vault cannot be sealed when in standby mode; please restart instead")
return errors.New("vault cannot be sealed when in standby mode; please restart instead")
c.logger.Printf("[ERR] core: vault cannot seal when in standby mode; please restart instead")
return errors.New("vault cannot seal when in standby mode; please restart instead")
}
return err
}
// Attempt to use the token (decrement num_uses)
// If we can't, we still continue attempting the seal, so long as the token
// has appropriate permissions
if te != nil {
if err := c.tokenStore.UseToken(te); err != nil {
c.logger.Printf("[ERR] core: failed to use token: %v", err)
retErr = ErrInternalError
}
}
// Verify that this operation is allowed
allowed, rootPrivs := acl.AllowOperation(req.Operation, req.Path)
@ -1206,7 +1215,7 @@ func (c *Core) Seal(token string) (retErr error) {
return logical.ErrPermissionDenied
}
// Seal the Vault
//Seal the Vault
err = c.sealInternal()
if err == nil && retErr == ErrInternalError {
c.logger.Printf("[ERR] core: core is successfully sealed but another error occurred during the operation")
@ -1217,9 +1226,60 @@ func (c *Core) Seal(token string) (retErr error) {
return
}
// sealInternal is an internal method used to seal the vault.
// It does not do any authorization checking. The stateLock must
// be held prior to calling.
// StepDown is used to step down from leadership. It returns nil without doing
// anything when the core is sealed, has no HA backend, or is in standby.
// Otherwise the supplied token must be allowed to perform an update on
// "sys/step-down" with root privileges; when it is, a signal is sent on
// manualStepDownCh without blocking.
func (c *Core) StepDown(token string) error {
	defer metrics.MeasureSince([]string{"core", "step_down"}, time.Now())

	c.stateLock.Lock()
	defer c.stateLock.Unlock()

	// Nothing to do unless we are unsealed, HA-enabled and not in standby
	if c.sealed || c.ha == nil || c.standby {
		return nil
	}

	// Request used both for the ACL lookup and the permission check
	stepDownReq := &logical.Request{
		Operation:   logical.UpdateOperation,
		Path:        "sys/step-down",
		ClientToken: token,
	}

	acl, tokenEntry, err := c.fetchACLandTokenEntry(stepDownReq)
	if err != nil {
		return err
	}

	// Attempt to use the token (decrement num_uses)
	if tokenEntry != nil {
		if useErr := c.tokenStore.UseToken(tokenEntry); useErr != nil {
			c.logger.Printf("[ERR] core: failed to use token: %v", useErr)
			return useErr
		}
	}

	// The operation must be allowed, and we always require root privileges
	// for it
	allowed, rootPrivs := acl.AllowOperation(stepDownReq.Operation, stepDownReq.Path)
	if !(allowed && rootPrivs) {
		return logical.ErrPermissionDenied
	}

	// Non-blocking send: if the signal cannot be delivered right now, only a
	// warning is logged
	select {
	case c.manualStepDownCh <- struct{}{}:
	default:
		c.logger.Printf("[WARN] core: manual step-down operation already queued")
	}

	return nil
}
// sealInternal is an internal method used to seal the vault. It does not do
// any authorization checking. The stateLock must be held prior to calling.
func (c *Core) sealInternal() error {
// Enable that we are sealed to prevent further transactions
c.sealed = true
@ -1244,6 +1304,7 @@ func (c *Core) sealInternal() error {
return err
}
c.logger.Printf("[INFO] core: vault is sealed")
return nil
}
@ -1353,8 +1414,9 @@ func (c *Core) preSeal() error {
// runStandby is a long running routine that is used when an HA backend
// is enabled. It waits until we are leader and switches this Vault to
// active.
func (c *Core) runStandby(doneCh, stopCh chan struct{}) {
func (c *Core) runStandby(doneCh, stopCh, manualStepDownCh chan struct{}) {
defer close(doneCh)
defer close(manualStepDownCh)
c.logger.Printf("[INFO] core: entering standby mode")
// Monitor for key rotation
@ -1418,11 +1480,15 @@ func (c *Core) runStandby(doneCh, stopCh chan struct{}) {
}
// Monitor a loss of leadership
var manualStepDown bool
select {
case <-leaderLostCh:
c.logger.Printf("[WARN] core: leadership lost, stopping active operation")
case <-stopCh:
c.logger.Printf("[WARN] core: stopping active operation")
case <-manualStepDownCh:
c.logger.Printf("[WARN] core: stepping down from active operation to standby")
manualStepDown = true
}
// Clear ourself as leader
@ -1443,6 +1509,12 @@ func (c *Core) runStandby(doneCh, stopCh chan struct{}) {
if preSealErr != nil {
c.logger.Printf("[ERR] core: pre-seal teardown failed: %v", err)
}
// If we've merely stepped down, we could instantly grab the lock
// again. Give the other nodes a chance.
if manualStepDown {
time.Sleep(manualStepDownSleepPeriod)
}
}
}

View File

@ -1106,9 +1106,6 @@ func TestCore_Standby_Seal(t *testing.T) {
// Wait for core to become active
testWaitActive(t, core)
// Ensure that the original clean function has stopped running
time.Sleep(2 * time.Second)
// Check the leader is local
isLeader, advertise, err := core.Leader()
if err != nil {
@ -1183,6 +1180,180 @@ func TestCore_Standby_Seal(t *testing.T) {
}
}
// TestCore_StepDown brings up two cores sharing the same in-memory physical
// and HA backends, steps the first one down and checks that the second one
// becomes leader, then steps the second one down and checks that leadership
// returns to the first.
func TestCore_StepDown(t *testing.T) {
	// Create the first core and initialize it
	inm := physical.NewInmem()
	inmha := physical.NewInmemHA()
	advertiseOriginal := "http://127.0.0.1:8200"
	core, err := NewCore(&CoreConfig{
		Physical:      inm,
		HAPhysical:    inmha,
		AdvertiseAddr: advertiseOriginal,
		DisableMlock:  true,
	})
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	key, root := TestCoreInit(t, core)
	if _, err := core.Unseal(TestKeyCopy(key)); err != nil {
		t.Fatalf("unseal err: %s", err)
	}

	// Verify unsealed
	sealed, err := core.Sealed()
	if err != nil {
		t.Fatalf("err checking seal status: %s", err)
	}
	if sealed {
		t.Fatal("should not be sealed")
	}

	// Wait for core to become active
	testWaitActive(t, core)

	// Check the leader is local
	isLeader, advertise, err := core.Leader()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if !isLeader {
		t.Fatalf("should be leader")
	}
	if advertise != advertiseOriginal {
		t.Fatalf("Bad advertise: %v", advertise)
	}

	// Create the second core and initialize it
	advertiseOriginal2 := "http://127.0.0.1:8500"
	core2, err := NewCore(&CoreConfig{
		Physical:      inm,
		HAPhysical:    inmha,
		AdvertiseAddr: advertiseOriginal2,
		DisableMlock:  true,
	})
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if _, err := core2.Unseal(TestKeyCopy(key)); err != nil {
		t.Fatalf("unseal err: %s", err)
	}

	// Verify unsealed
	sealed, err = core2.Sealed()
	if err != nil {
		t.Fatalf("err checking seal status: %s", err)
	}
	if sealed {
		t.Fatal("should not be sealed")
	}

	// Core2 should be in standby
	standby, err := core2.Standby()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if !standby {
		t.Fatalf("should be standby")
	}

	// Check the leader is not local
	isLeader, advertise, err = core2.Leader()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if isLeader {
		t.Fatalf("should not be leader")
	}
	if advertise != advertiseOriginal {
		t.Fatalf("Bad advertise: %v", advertise)
	}

	// Step down core
	err = core.StepDown(root)
	if err != nil {
		t.Fatalf("error stepping down core 1: %v", err)
	}

	// Give time to switch leaders
	time.Sleep(2 * time.Second)

	// Core1 should be in standby
	standby, err = core.Standby()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if !standby {
		t.Fatalf("should be standby")
	}

	// Check the leader is core2
	isLeader, advertise, err = core2.Leader()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if !isLeader {
		t.Fatalf("should be leader")
	}
	if advertise != advertiseOriginal2 {
		t.Fatalf("Bad advertise: %v", advertise)
	}

	// Check the leader is not local
	isLeader, advertise, err = core.Leader()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if isLeader {
		t.Fatalf("should not be leader")
	}
	if advertise != advertiseOriginal2 {
		t.Fatalf("Bad advertise: %v", advertise)
	}

	// Step down core2
	err = core2.StepDown(root)
	if err != nil {
		t.Fatalf("error stepping down core 2: %v", err)
	}

	// Give time to switch leaders -- core 1 will still be waiting on its
	// cooling off period so give it a full 10 seconds to recover
	time.Sleep(10 * time.Second)

	// Core2 should be in standby
	standby, err = core2.Standby()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if !standby {
		t.Fatalf("should be standby")
	}

	// Check the leader is core1
	isLeader, advertise, err = core.Leader()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if !isLeader {
		t.Fatalf("should be leader")
	}
	if advertise != advertiseOriginal {
		t.Fatalf("Bad advertise: %v", advertise)
	}

	// Check the leader is not local
	isLeader, advertise, err = core2.Leader()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	if isLeader {
		t.Fatalf("should not be leader")
	}
	if advertise != advertiseOriginal {
		t.Fatalf("Bad advertise: %v", advertise)
	}
}
func TestCore_CleanLeaderPrefix(t *testing.T) {
// Create the first core and initialize it
inm := physical.NewInmem()

View File

@ -11,7 +11,9 @@ description: |-
<dl>
<dt>Description</dt>
<dd>
Seals the Vault. In HA mode, only an active node can be sealed. Standby nodes should be restarted to get the same effect.
Seals the Vault. In HA mode, only an active node can be sealed. Standby
nodes should be restarted to get the same effect. Requires a token with
`root` policy or `sudo` capability on the path.
</dd>
<dt>Method</dt>

View File

@ -0,0 +1,33 @@
---
layout: "http"
page_title: "HTTP API: /sys/step-down"
sidebar_current: "docs-http-ha-step-down"
description: |-
The '/sys/step-down' endpoint causes the node to give up active status.
---
# /sys/step-down
<dl>
<dt>Description</dt>
<dd>
Forces the node to give up active status. If the node does not have active
status, this endpoint does nothing. Note that the node will sleep for ten
seconds before attempting to grab the active lock again, but if no standby
nodes grab the active lock in the interim, the same node may become the
active node again. Requires a token with `root` policy or `sudo` capability
on the path.
</dd>
<dt>Method</dt>
<dd>PUT</dd>
<dt>Parameters</dt>
<dd>
None
</dd>
<dt>Returns</dt>
<dd>A `204` response code.
</dd>
</dl>

View File

@ -107,6 +107,9 @@
<li<%= sidebar_current("docs-http-ha-leader") %>>
<a href="/docs/http/sys-leader.html">/sys/leader</a>
</li>
<li<%= sidebar_current("docs-http-ha-step-down") %>>
<a href="/docs/http/sys-step-down.html">/sys/step-down</a>
</li>
</ul>
</li>