Fix revocation of leases when num_uses goes to 0 (#2190)

Jeff Mitchell 2016-12-16 13:11:55 -05:00 committed by GitHub
parent 99eff7b2b4
commit f6044764c0
9 changed files with 397 additions and 50 deletions

View File

@@ -214,9 +214,11 @@ func (m *ExpirationManager) revokeCommon(leaseID string, force, skipToken bool)
return err
}
// Delete the secondary index
if err := m.removeIndexByToken(le.ClientToken, le.LeaseID); err != nil {
return err
// Delete the secondary index, but only if it's a leased secret (not auth)
if le.Secret != nil {
if err := m.removeIndexByToken(le.ClientToken, le.LeaseID); err != nil {
return err
}
}
// Clear the expiration handler

View File

@@ -705,6 +705,14 @@ func TestExpiration_revokeEntry_token(t *testing.T) {
t.Fatalf("err: %v", err)
}
// N.B.: Vault doesn't allow both a secret and auth to be returned, but the
// reason for both is that auth needs to be included in order to use the
// token store as it's the only mounted backend, *but* RegisterAuth doesn't
// actually create the index by token, only Register (for a Secret) does.
// So without the Secret we don't do anything when removing the index which
// (at the time of writing) now fails because a bug causing every token
// expiration to do an extra delete to a non-existent key has been fixed,
// and this test relies on this nonstandard behavior.
le := &leaseEntry{
LeaseID: "foo/bar/1234",
Auth: &logical.Auth{
@@ -713,6 +721,11 @@ func TestExpiration_revokeEntry_token(t *testing.T) {
TTL: time.Minute,
},
},
Secret: &logical.Secret{
LeaseOptions: logical.LeaseOptions{
TTL: time.Minute,
},
},
ClientToken: root.ID,
Path: "foo/bar",
IssueTime: time.Now(),
@@ -725,7 +738,7 @@ func TestExpiration_revokeEntry_token(t *testing.T) {
if err := exp.createIndexByToken(le.ClientToken, le.LeaseID); err != nil {
t.Fatalf("error creating secondary index: %v", err)
}
exp.updatePending(le, le.Auth.LeaseTotal())
exp.updatePending(le, le.Secret.LeaseTotal())
indexEntry, err := exp.indexByToken(le.ClientToken, le.LeaseID)
if err != nil {

View File

@@ -40,6 +40,20 @@ const (
// rolesPrefix is the prefix used to store role information
rolesPrefix = "roles/"
// tokenRevocationDeferred indicates that the token should not be used
// again but is currently fulfilling its final use
tokenRevocationDeferred = -1
// tokenRevocationInProgress indicates that revocation of that token/its
// leases is ongoing
tokenRevocationInProgress = -2
// tokenRevocationFailed indicates that revocation failed; the entry is
// kept around so that when the cleanup function is run it can be tried
// again (or when the revocation function is run again), but all other uses
// will report the token invalid
tokenRevocationFailed = -3
)
var (
@@ -48,6 +62,14 @@ var (
// pathSuffixSanitize is used to ensure a path suffix in a role is valid.
pathSuffixSanitize = regexp.MustCompile("\\w[\\w-.]+\\w")
destroyCubbyhole = func(ts *TokenStore, saltedID string) error {
if ts.cubbyholeBackend == nil {
// Should only ever happen in testing
return nil
}
return ts.cubbyholeBackend.revoke(salt.SaltID(ts.cubbyholeBackend.saltUUID, saltedID, salt.SHA1Hash))
}
)
// TokenStore is used to manage client tokens. Tokens are used for
@@ -66,6 +88,8 @@ type TokenStore struct {
policyLookupFunc func(string) (*Policy, error)
tokenLocks map[string]*sync.RWMutex
cubbyholeDestroyer func(*TokenStore, string) error
}
// NewTokenStore is used to construct a token store that is
@@ -76,7 +100,8 @@ func NewTokenStore(c *Core, config *logical.BackendConfig) (*TokenStore, error)
// Initialize the store
t := &TokenStore{
view: view,
view: view,
cubbyholeDestroyer: destroyCubbyhole,
}
if c.policyStore != nil {
@@ -460,7 +485,13 @@ type TokenEntry struct {
// Used for operators to be able to associate with the source
DisplayName string `json:"display_name" mapstructure:"display_name" structs:"display_name"`
// Used to restrict the number of uses (zero is unlimited). This is to support one-time-tokens (generalized).
// Used to restrict the number of uses (zero is unlimited). This is to
// support one-time-tokens (generalized). There are a few special values:
// if it's -1 it has run through its use counts and is executing its final
// use; if it's -2 it is tainted, which means revocation is currently
// running on it; and if it's -3 it's also tainted but revocation
// previously ran and failed, so this hints the cleanup function to try it
// again.
NumUses int `json:"num_uses" mapstructure:"num_uses" structs:"num_uses"`
// Time of token creation
@@ -722,7 +753,7 @@ func (ts *TokenStore) UseToken(te *TokenEntry) (*TokenEntry, error) {
defer lock.Unlock()
// Call lookupSalted instead of Lookup to avoid deadlocking since Lookup grabs a read lock
te, err := ts.lookupSalted(ts.SaltID(te.ID))
te, err := ts.lookupSalted(ts.SaltID(te.ID), false)
if err != nil {
return nil, fmt.Errorf("failed to refresh entry: %v", err)
}
@@ -746,18 +777,9 @@ func (ts *TokenStore) UseToken(te *TokenEntry) (*TokenEntry, error) {
te.NumUses -= 1
}
// Marshal the entry
enc, err := json.Marshal(te)
err = ts.storeCommon(te, false)
if err != nil {
return nil, fmt.Errorf("failed to encode entry: %v", err)
}
// Write under the primary ID
saltedId := ts.SaltID(te.ID)
path := lookupPrefix + saltedId
le := &logical.StorageEntry{Key: path, Value: enc}
if err := ts.view.Put(le); err != nil {
return nil, fmt.Errorf("failed to persist entry: %v", err)
return nil, err
}
return te, nil
@@ -783,11 +805,13 @@ func (ts *TokenStore) Lookup(id string) (*TokenEntry, error) {
lock.RLock()
defer lock.RUnlock()
return ts.lookupSalted(ts.SaltID(id))
return ts.lookupSalted(ts.SaltID(id), false)
}
// lookupSlated is used to find a token given its salted ID
func (ts *TokenStore) lookupSalted(saltedId string) (*TokenEntry, error) {
// lookupSalted is used to find a token given its salted ID. If tainted is
// true, entries that are in some revocation state (currently, indicated by
// num uses < 0) will still be returned.
func (ts *TokenStore) lookupSalted(saltedId string, tainted bool) (*TokenEntry, error) {
// Lookup token
path := lookupPrefix + saltedId
raw, err := ts.view.Get(path)
@@ -806,8 +830,8 @@ func (ts *TokenStore) lookupSalted(saltedId string) (*TokenEntry, error) {
return nil, fmt.Errorf("failed to decode entry: %v", err)
}
// This is a token that is awaiting deferred revocation
if entry.NumUses == -1 {
// This is a token that is awaiting deferred revocation or tainted
if entry.NumUses < 0 && !tainted {
return nil, nil
}
@@ -869,46 +893,109 @@ func (ts *TokenStore) Revoke(id string) error {
// revokeSalted is used to invalidate a given salted token,
// any child tokens will be orphaned.
func (ts *TokenStore) revokeSalted(saltedId string) error {
func (ts *TokenStore) revokeSalted(saltedId string) (ret error) {
// Protect the entry lookup/writing with locks. The rub here is that we
// don't know the ID until we look it up once, so first we look it up, then
// do a locked lookup.
entry, err := ts.lookupSalted(saltedId, true)
if err != nil {
return err
}
if entry == nil {
return nil
}
lock := ts.getTokenLock(entry.ID)
lock.Lock()
// Lookup the token first
entry, err := ts.lookupSalted(saltedId)
entry, err = ts.lookupSalted(saltedId, true)
if err != nil {
lock.Unlock()
return err
}
if entry == nil {
lock.Unlock()
return nil
}
// On failure we write -3, so if we hit -2 here we're already running a
// revocation operation. This can happen due to e.g. recursion into this
// function via the expiration manager's RevokeByToken.
if entry.NumUses == tokenRevocationInProgress {
lock.Unlock()
return nil
}
// This acts as a WAL. lookupSalted will no longer return this entry,
// so the token cannot be used, but this way we can keep the entry
// around until after the rest of this function is attempted, and a
// cleanup function can key off of this value to try again.
entry.NumUses = tokenRevocationInProgress
err = ts.storeCommon(entry, false)
lock.Unlock()
if err != nil {
return err
}
// Nuke the primary key first
path := lookupPrefix + saltedId
if ts.view.Delete(path); err != nil {
return fmt.Errorf("failed to delete entry: %v", err)
// If we are returning an error, mark the entry with -3 to indicate
// failed revocation. This way we don't try to clean up during active
// revocation (-2).
defer func() {
if ret != nil {
lock.Lock()
defer lock.Unlock()
// Lookup the token again to make sure something else didn't
// revoke in the interim
entry, err := ts.lookupSalted(saltedId, true)
if err != nil {
return
}
// If it exists just taint to -3 rather than trying to figure
// out what it means if it's already -3 after the -2 above
if entry != nil {
entry.NumUses = tokenRevocationFailed
ts.storeCommon(entry, false)
}
}
}()
// Destroy the token's cubby. This should go first as it's a
// security-sensitive item.
err = ts.cubbyholeDestroyer(ts, saltedId)
if err != nil {
return err
}
// Revoke all secrets under this token. This should go first as it's a
// security-sensitive item.
if err := ts.expiration.RevokeByToken(entry); err != nil {
return err
}
// Clear the secondary index if any
if entry != nil && entry.Parent != "" {
if entry.Parent != "" {
path := parentPrefix + ts.SaltID(entry.Parent) + "/" + saltedId
if ts.view.Delete(path); err != nil {
if err = ts.view.Delete(path); err != nil {
return fmt.Errorf("failed to delete entry: %v", err)
}
}
// Clear the accessor index if any
if entry != nil && entry.Accessor != "" {
if entry.Accessor != "" {
path := accessorPrefix + ts.SaltID(entry.Accessor)
if ts.view.Delete(path); err != nil {
if err = ts.view.Delete(path); err != nil {
return fmt.Errorf("failed to delete entry: %v", err)
}
}
// Revoke all secrets under this token
if entry != nil {
if err := ts.expiration.RevokeByToken(entry); err != nil {
return err
}
}
// Destroy the cubby space
err = ts.destroyCubbyhole(saltedId)
if err != nil {
return err
// Now that the entry is not usable for any revocation tasks, nuke it
path := lookupPrefix + saltedId
if err = ts.view.Delete(path); err != nil {
return fmt.Errorf("failed to delete entry: %v", err)
}
return nil
@@ -993,7 +1080,7 @@ func (ts *TokenStore) lookupBySaltedAccessor(saltedAccessor string) (accessorEnt
// If we hit an error, assume it's a pre-struct straight token ID
if err != nil {
aEntry.TokenID = string(entry.Value)
te, err := ts.lookupSalted(ts.SaltID(aEntry.TokenID))
te, err := ts.lookupSalted(ts.SaltID(aEntry.TokenID), false)
if err != nil {
return accessorEntry{}, fmt.Errorf("failed to look up token using accessor index: %s", err)
}
@@ -1637,8 +1724,13 @@ func (ts *TokenStore) handleLookup(
return logical.ErrorResponse("missing token ID"), logical.ErrInvalidRequest
}
lock := ts.getTokenLock(id)
lock.RLock()
defer lock.RUnlock()
// Lookup the token
out, err := ts.Lookup(id)
saltedId := ts.SaltID(id)
out, err := ts.lookupSalted(saltedId, true)
if err != nil {
return logical.ErrorResponse(err.Error()), logical.ErrInvalidRequest

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"reflect"
"strings"
"sync"
"testing"
"time"
@@ -547,7 +548,7 @@ func TestTokenStore_UseToken(t *testing.T) {
if te == nil {
t.Fatalf("token entry for use #2 was nil")
}
if te.NumUses != -1 {
if te.NumUses != tokenRevocationDeferred {
t.Fatalf("token entry after use #2 did not have revoke flag")
}
ts.Revoke(te.ID)
@@ -2987,3 +2988,163 @@ func TestTokenStore_AllowedDisallowedPolicies(t *testing.T) {
t.Fatalf("expected an error")
}
}
// Issue 2189
func TestTokenStore_RevokeUseCountToken(t *testing.T) {
var resp *logical.Response
var err error
cubbyFuncLock := &sync.RWMutex{}
cubbyFuncLock.Lock()
exp := mockExpiration(t)
ts := exp.tokenStore
root, _ := exp.tokenStore.rootToken()
tokenReq := &logical.Request{
Path: "create",
ClientToken: root.ID,
Operation: logical.UpdateOperation,
Data: map[string]interface{}{
"num_uses": 1,
},
}
resp, err = ts.HandleRequest(tokenReq)
if err != nil || (resp != nil && resp.IsError()) {
t.Fatalf("err:%v resp:%v", err, resp)
}
tut := resp.Auth.ClientToken
saltTut := ts.SaltID(tut)
te, err := ts.lookupSalted(saltTut, false)
if err != nil {
t.Fatal(err)
}
if te == nil {
t.Fatal("nil entry")
}
if te.NumUses != 1 {
t.Fatalf("bad: %d", te.NumUses)
}
te, err = ts.UseToken(te)
if err != nil {
t.Fatal(err)
}
if te == nil {
t.Fatal("nil entry")
}
if te.NumUses != tokenRevocationDeferred {
t.Fatalf("bad: %d", te.NumUses)
}
// Should return no entry because it's tainted
te, err = ts.lookupSalted(saltTut, false)
if err != nil {
t.Fatal(err)
}
if te != nil {
t.Fatalf("%#v", te)
}
// But it should show up in an API lookup call
req := &logical.Request{
Path: "lookup-self",
ClientToken: tut,
Operation: logical.UpdateOperation,
}
resp, err = ts.HandleRequest(req)
if err != nil {
t.Fatal(err)
}
if resp == nil || resp.Data == nil || resp.Data["num_uses"] == nil {
t.Fatal("nil resp or data")
}
if resp.Data["num_uses"].(int) != -1 {
t.Fatalf("bad: %v", resp.Data["num_uses"])
}
// Should return tainted entries
te, err = ts.lookupSalted(saltTut, true)
if err != nil {
t.Fatal(err)
}
if te == nil {
t.Fatal("nil entry")
}
if te.NumUses != tokenRevocationDeferred {
t.Fatalf("bad: %d", te.NumUses)
}
origDestroyCubbyhole := ts.cubbyholeDestroyer
ts.cubbyholeDestroyer = func(*TokenStore, string) error {
return fmt.Errorf("keep it frosty")
}
err = ts.revokeSalted(saltTut)
if err == nil {
t.Fatalf("expected err")
}
// Since revocation failed we should see the tokenRevocationFailed canary value
te, err = ts.lookupSalted(saltTut, true)
if err != nil {
t.Fatal(err)
}
if te == nil {
t.Fatal("nil entry")
}
if te.NumUses != tokenRevocationFailed {
t.Fatalf("bad: %d", te.NumUses)
}
// Check the race condition situation by making the process sleep
ts.cubbyholeDestroyer = func(*TokenStore, string) error {
time.Sleep(1 * time.Second)
return fmt.Errorf("keep it frosty")
}
cubbyFuncLock.Unlock()
go func() {
cubbyFuncLock.RLock()
err := ts.revokeSalted(saltTut)
cubbyFuncLock.RUnlock()
if err == nil {
t.Fatalf("expected error")
}
}()
// Give time for the function to start and grab locks
time.Sleep(200 * time.Millisecond)
te, err = ts.lookupSalted(saltTut, true)
if err != nil {
t.Fatal(err)
}
if te == nil {
t.Fatal("nil entry")
}
if te.NumUses != tokenRevocationInProgress {
t.Fatalf("bad: %d", te.NumUses)
}
// Let things catch up
time.Sleep(2 * time.Second)
// Put back to normal
cubbyFuncLock.Lock()
defer cubbyFuncLock.Unlock()
ts.cubbyholeDestroyer = origDestroyCubbyhole
err = ts.revokeSalted(saltTut)
if err != nil {
t.Fatal(err)
}
te, err = ts.lookupSalted(saltTut, true)
if err != nil {
t.Fatal(err)
}
if te != nil {
t.Fatal("found entry")
}
}

View File

@@ -9,5 +9,5 @@ func init() {
// A pre-release marker for the version. If this is "" (empty string)
// then it means that it is a final release. Otherwise, this is a pre-release
// such as "dev" (in development), "beta", "rc1", etc.
VersionPrerelease = "dev"
VersionPrerelease = ""
}

View File

@@ -2,7 +2,7 @@ set :base_url, "https://www.vaultproject.io/"
activate :hashicorp do |h|
h.name = "vault"
h.version = "0.6.3"
h.version = "0.6.4"
h.github_slug = "hashicorp/vault"
h.website_root = "website"
end

View File

@@ -3,7 +3,7 @@ layout: "install"
page_title: "Upgrading to Vault 0.6.3"
sidebar_current: "docs-install-upgrade-to-0.6.3"
description: |-
Learn how to upgrade to Vault 0.63.
Learn how to upgrade to Vault 0.6.3.
---
# Overview

View File

@@ -0,0 +1,76 @@
---
layout: "install"
page_title: "Upgrading to Vault 0.6.4"
sidebar_current: "docs-install-upgrade-to-0.6.4"
description: |-
Learn how to upgrade to Vault 0.6.4.
---
# Overview
This page contains the list of deprecations and important or breaking changes
for Vault 0.6.4. Please read it carefully.
## Changes to `default` Policy Addition Rules for `auth/token/create`
An issue was reported indicating that in some cases, the stated behavior of the
`default` policy being added to tokens unless specifically requested could lead
to a privilege escalation scenario if the parent token did not itself have the
`default` policy. In most cases (e.g. when the built-in `default` policy is
used unmodified) the escalated privileges are unlikely to be dangerous, but
this behavior is undesired.
As such, we have modified the rules around `default` in a few ways when
creating tokens using this endpoint:
1. If a token creation request does not specify desired policies, the parent's
policies will be used (as normal) but `default` will not be automatically
added, so the child token will only have `default` if the parent does
2. If a token creation request specifies desired policies, `default` will be
added automatically but only if the parent token has `default`
3. If the token has `sudo` capability on the token creation endpoint, `default`
is always added; since `sudo` capability on this endpoint allows adding any
policy, this is not privilege escalation
4. When using token store roles, `default` is always added to the list of child
token policies unless `default` is contained in `disallowed_policies`
**Regardless of provenance**, if a token being created has the `default` policy
and `no_default_policy` has been set true for the request, the `default` policy
will be stripped from the final set of policies.
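These rules can be easier to follow in code. The sketch below is a hypothetical
illustration of the decision flow just described, not Vault's actual
implementation; every name in it (`childPolicies`, `parentPolicies`, `hasSudo`,
and so on) is invented for the example:

```go
package main

import "fmt"

// contains reports whether list includes s.
func contains(list []string, s string) bool {
	for _, v := range list {
		if v == s {
			return true
		}
	}
	return false
}

// childPolicies is an invented illustration of the four rules above plus the
// no_default_policy override; it is not Vault's implementation.
func childPolicies(parentPolicies, requested []string, hasSudo, usingRole bool,
	roleDisallowed []string, noDefaultPolicy bool) []string {

	var out []string
	if len(requested) == 0 {
		// Rule 1: inherit the parent's policies; "default" is not added on
		// top, so the child has it only if the parent does.
		out = append(out, parentPolicies...)
	} else {
		out = append(out, requested...)
		// Rule 2: add "default" automatically, but only if the parent has it.
		if contains(parentPolicies, "default") && !contains(out, "default") {
			out = append(out, "default")
		}
	}

	// Rule 3: sudo capability on the create endpoint always adds "default".
	if hasSudo && !contains(out, "default") {
		out = append(out, "default")
	}

	// Rule 4: token store roles always add "default" unless the role's
	// disallowed_policies contains it.
	if usingRole && !contains(roleDisallowed, "default") && !contains(out, "default") {
		out = append(out, "default")
	}

	// Regardless of provenance: no_default_policy strips "default".
	if noDefaultPolicy {
		kept := out[:0]
		for _, p := range out {
			if p != "default" {
				kept = append(kept, p)
			}
		}
		out = kept
	}
	return out
}

func main() {
	// Parent without "default" and no requested policies: the child no longer
	// receives "default" implicitly.
	fmt.Println(childPolicies([]string{"ops"}, nil, false, false, nil, false))
	// Parent with "default" and explicit requested policies: "default" is
	// added automatically.
	fmt.Println(childPolicies([]string{"default", "ops"}, []string{"app"}, false, false, nil, false))
}
```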
## Many Users Should Run `auth/token/cleanup`
While investigating a report of accessor entries not being removed correctly
from Vault's data store, a security issue was discovered: if limited use-count
tokens are being used and expire due to the use-count dropping to zero (rather
than due to the token's TTL expiring), several pieces of Vault's token
revocation logic would not be properly run. This included cleaning up the
associated accessor entry from the data store, but more importantly, included
the logic used to immediately expire leases associated with the token.
These leases would still be expired when their TTL ran out rather than live
forever, which limits the severity of this issue, but it is still a potentially
serious bug depending on each particular setup.
To mitigate this issue we have taken the following steps:
1. Updated the token revocation logic to be more resilient to failures that can
happen as it performs its various steps; the revocation will now properly
only be considered successful if no error has occurred, and the token entry
will remain in the data store (but not be usable for performing Vault
functions) until all aspects of revocation report success
2. Added an endpoint, `auth/token/cleanup`, that can both clean up the leaked
accessor entries and expire the leases associated with those accessors'
tokens.
In most cases, running `auth/token/cleanup` should expire all outstanding
leases that were generated by these revoked limited-use-count tokens.
If you are both using limited-use-count tokens *and* you are using them to
issue leased secrets, you should consider upgrading to 0.6.4 and running the
`auth/token/cleanup` endpoint immediately.
Note: response wrapping tokens do not allow generating leases but have still
been subject to leaking the accessor entries, so if you have been a heavy user
of response wrapping you should still consider running the cleanup function.
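For reference, a minimal sketch of invoking this endpoint with the Vault Go API
client (`github.com/hashicorp/vault/api`) might look like the following. The
endpoint path is taken from this guide; the address, the token, and the
assumption that an empty write triggers the cleanup are all placeholders:

```go
package main

import (
	"log"

	"github.com/hashicorp/vault/api"
)

func main() {
	// Placeholder address and token; replace with values for your cluster.
	config := api.DefaultConfig()
	config.Address = "https://vault.example.com:8200"

	client, err := api.NewClient(config)
	if err != nil {
		log.Fatalf("failed to create Vault client: %v", err)
	}
	client.SetToken("replace-with-a-sufficiently-privileged-token")

	// The endpoint path comes from this guide; whether it accepts any
	// parameters is an assumption, so no data is sent here.
	if _, err := client.Logical().Write("auth/token/cleanup", nil); err != nil {
		log.Fatalf("cleanup request failed: %v", err)
	}
	log.Println("cleanup request submitted")
}
```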

View File

@@ -37,6 +37,9 @@
<li<%= sidebar_current("docs-install-upgrade-to-0.6.3") %>>
<a href="/docs/install/upgrade-to-0.6.3.html">Upgrade to 0.6.3</a>
</li>
<li<%= sidebar_current("docs-install-upgrade-to-0.6.4") %>>
<a href="/docs/install/upgrade-to-0.6.4.html">Upgrade to 0.6.4</a>
</li>
</ul>
</li>
</ul>