open-vault/builtin/logical/database/rotation.go
Hridoy Roy dbd178250e
Port: Premature Rotation For autorotate (#12563)
* port of ldap fix for early cred rotation

* some more porting

* another couple lines to port

* final commits before report

* remove deadlock

* needs testing

* updates

* Sync with OpenLDAP PR

* Update the update error handling for items not found in the queue

* WIP unit tests
* Need to configure DB mount correctly, with db type mockv5
* Need to find a way to inject errors into that mock db

* throw error on role creation failure

* do not swallow error on role creation

* comment out wip tests and add in a test for disallowed role

* Use newly generated password in WAL

Co-authored-by: Michael Golowka <72365+pcman312@users.noreply.github.com>

* return err on popFromRotationQueueByKey error; cleanup on setStaticAccount

* test: fix TestPlugin_lifecycle

* Uncomment and fix unit tests
* Use mock database plugin to inject errors
* Tidy test code to rely less on code internals where possible
* Some stronger test assertions

* Undo logging updates

* Add changelog

* Remove ticker and background threads from WAL tests

* Keep pre-existing API behaviour of allowing update static role to act as a create

* Switch test back to update operation

* Revert my revert, and fix some test bugs

* Fix TestBackend_StaticRole_LockRegression

* clean up defer on TestPlugin_lifecycle

* unwrap reqs on cleanup

* setStaticAccount: don't hold a write lock

* TestStoredWALsCorrectlyProcessed: set replication state to unknown

Co-authored-by: Tom Proctor <tomhjp@users.noreply.github.com>
Co-authored-by: Michael Golowka <72365+pcman312@users.noreply.github.com>
Co-authored-by: Calvin Leung Huang <1883212+calvn@users.noreply.github.com>
2021-09-21 17:45:04 -07:00

599 lines
20 KiB
Go

package database
import (
"context"
"errors"
"fmt"
"strconv"
"time"
"github.com/hashicorp/errwrap"
"github.com/hashicorp/go-secure-stdlib/strutil"
v5 "github.com/hashicorp/vault/sdk/database/dbplugin/v5"
"github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/helper/locksutil"
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/sdk/queue"
)
const (
// Default interval to check the queue for items needing rotation
defaultQueueTickSeconds = 5
// Config key to set an alternate interval
queueTickIntervalKey = "rotation_queue_tick_interval"
// WAL storage key used for static account rotations
staticWALKey = "staticRotationKey"
)
// populateQueue loads the priority queue with existing static accounts. This
// occurs at initialization, after any WAL entries of failed or interrupted
// rotations have been processed. It lists the roles from storage and searches
// for any that have an associated static account, then adds them to the
// priority queue for rotations.
func (b *databaseBackend) populateQueue(ctx context.Context, s logical.Storage) {
log := b.Logger()
log.Info("populating role rotation queue")
// Build map of role name / wal entries
walMap, err := b.loadStaticWALs(ctx, s)
if err != nil {
log.Warn("unable to load rotation WALs", "error", err)
}
roles, err := s.List(ctx, databaseStaticRolePath)
if err != nil {
log.Warn("unable to list role for enqueueing", "error", err)
return
}
for _, roleName := range roles {
select {
case <-ctx.Done():
log.Info("rotation queue restore cancelled")
return
default:
}
role, err := b.StaticRole(ctx, s, roleName)
if err != nil {
log.Warn("unable to read static role", "error", err, "role", roleName)
continue
}
item := queue.Item{
Key: roleName,
Priority: role.StaticAccount.NextRotationTime().Unix(),
}
// Check if role name is in map
walEntry := walMap[roleName]
if walEntry != nil {
// Check walEntry last vault time
if walEntry.LastVaultRotation.IsZero() {
// A WAL's last Vault rotation can only ever be 0 for a role that
// was never successfully created. So we know this WAL couldn't
// have been created for this role we just retrieved from storage.
// i.e. it must be a hangover from a previous attempt at creating
// a role with the same name
log.Debug("deleting WAL with zero last rotation time", "WAL ID", walEntry.walID, "created", walEntry.walCreatedAt)
if err := framework.DeleteWAL(ctx, s, walEntry.walID); err != nil {
log.Warn("unable to delete zero-time WAL", "error", err, "WAL ID", walEntry.walID)
}
} else if walEntry.LastVaultRotation.Before(role.StaticAccount.LastVaultRotation) {
// WAL's last vault rotation record is older than the role's data, so
// delete and move on
log.Debug("deleting outdated WAL", "WAL ID", walEntry.walID, "created", walEntry.walCreatedAt)
if err := framework.DeleteWAL(ctx, s, walEntry.walID); err != nil {
log.Warn("unable to delete WAL", "error", err, "WAL ID", walEntry.walID)
}
} else {
log.Info("found WAL for role",
"role", item.Key,
"WAL ID", walEntry.walID)
item.Value = walEntry.walID
item.Priority = time.Now().Unix()
}
}
if err := b.pushItem(&item); err != nil {
log.Warn("unable to enqueue item", "error", err, "role", roleName)
}
}
}
// runTicker kicks off a periodic ticker that invoke the automatic credential
// rotation method at a determined interval. The default interval is 5 seconds.
func (b *databaseBackend) runTicker(ctx context.Context, queueTickInterval time.Duration, s logical.Storage) {
b.logger.Info("starting periodic ticker")
tick := time.NewTicker(queueTickInterval)
defer tick.Stop()
for {
select {
case <-tick.C:
b.rotateCredentials(ctx, s)
case <-ctx.Done():
b.logger.Info("stopping periodic ticker")
return
}
}
}
// setCredentialsWAL is used to store information in a WAL that can retry a
// credential setting or rotation in the event of partial failure.
type setCredentialsWAL struct {
NewPassword string `json:"new_password"`
RoleName string `json:"role_name"`
Username string `json:"username"`
LastVaultRotation time.Time `json:"last_vault_rotation"`
// Private fields which will not be included in json.Marshal/Unmarshal.
walID string
walCreatedAt int64 // Unix time at which the WAL was created.
}
// rotateCredentials sets a new password for a static account. This method is
// invoked in the runTicker method, which is in it's own go-routine, and invoked
// periodically (approximately every 5 seconds).
//
// This method loops through the priority queue, popping the highest priority
// item until it encounters the first item that does not yet need rotation,
// based on the current time.
func (b *databaseBackend) rotateCredentials(ctx context.Context, s logical.Storage) {
for b.rotateCredential(ctx, s) {
}
}
func (b *databaseBackend) rotateCredential(ctx context.Context, s logical.Storage) bool {
// Quit rotating credentials if shutdown has started
select {
case <-ctx.Done():
return false
default:
}
item, err := b.popFromRotationQueue()
if err != nil {
if err != queue.ErrEmpty {
b.logger.Error("error popping item from queue", "err", err)
}
return false
}
// Guard against possible nil item
if item == nil {
return false
}
// Grab the exclusive lock for this Role, to make sure we don't incur and
// writes during the rotation process
lock := locksutil.LockForKey(b.roleLocks, item.Key)
lock.Lock()
defer lock.Unlock()
// Validate the role still exists
role, err := b.StaticRole(ctx, s, item.Key)
if err != nil {
b.logger.Error("unable to load role", "role", item.Key, "error", err)
item.Priority = time.Now().Add(10 * time.Second).Unix()
if err := b.pushItem(item); err != nil {
b.logger.Error("unable to push item on to queue", "error", err)
}
return true
}
if role == nil {
b.logger.Warn("role not found", "role", item.Key, "error", err)
return true
}
// If "now" is less than the Item priority, then this item does not need to
// be rotated
if time.Now().Unix() < item.Priority {
if err := b.pushItem(item); err != nil {
b.logger.Error("unable to push item on to queue", "error", err)
}
// Break out of the for loop
return false
}
input := &setStaticAccountInput{
RoleName: item.Key,
Role: role,
}
// If there is a WAL entry related to this Role, the corresponding WAL ID
// should be stored in the Item's Value field.
if walID, ok := item.Value.(string); ok {
input.WALID = walID
}
resp, err := b.setStaticAccount(ctx, s, input)
if err != nil {
b.logger.Error("unable to rotate credentials in periodic function", "error", err)
// Increment the priority enough so that the next call to this method
// likely will not attempt to rotate it, as a back-off of sorts
item.Priority = time.Now().Add(10 * time.Second).Unix()
// Preserve the WALID if it was returned
if resp != nil && resp.WALID != "" {
item.Value = resp.WALID
}
if err := b.pushItem(item); err != nil {
b.logger.Error("unable to push item on to queue", "error", err)
}
// Go to next item
return true
}
// Clear any stored WAL ID as we must have successfully deleted our WAL to get here.
item.Value = ""
lvr := resp.RotationTime
if lvr.IsZero() {
lvr = time.Now()
}
// Update priority and push updated Item to the queue
nextRotation := lvr.Add(role.StaticAccount.RotationPeriod)
item.Priority = nextRotation.Unix()
if err := b.pushItem(item); err != nil {
b.logger.Warn("unable to push item on to queue", "error", err)
}
return true
}
// findStaticWAL loads a WAL entry by ID. If found, only return the WAL if it
// is of type staticWALKey, otherwise return nil
func (b *databaseBackend) findStaticWAL(ctx context.Context, s logical.Storage, id string) (*setCredentialsWAL, error) {
wal, err := framework.GetWAL(ctx, s, id)
if err != nil {
return nil, err
}
if wal == nil || wal.Kind != staticWALKey {
return nil, nil
}
data := wal.Data.(map[string]interface{})
walEntry := setCredentialsWAL{
walID: id,
walCreatedAt: wal.CreatedAt,
NewPassword: data["new_password"].(string),
RoleName: data["role_name"].(string),
Username: data["username"].(string),
}
lvr, err := time.Parse(time.RFC3339, data["last_vault_rotation"].(string))
if err != nil {
return nil, err
}
walEntry.LastVaultRotation = lvr
return &walEntry, nil
}
type setStaticAccountInput struct {
RoleName string
Role *roleEntry
WALID string
}
type setStaticAccountOutput struct {
RotationTime time.Time
// Optional return field, in the event WAL was created and not destroyed
// during the operation
WALID string
}
// setStaticAccount sets the password for a static account associated with a
// Role. This method does many things:
// - verifies role exists and is in the allowed roles list
// - loads an existing WAL entry if WALID input is given, otherwise creates a
// new WAL entry
// - gets a database connection
// - accepts an input password, otherwise generates a new one via gRPC to the
// database plugin
// - sets new password for the static account
// - uses WAL for ensuring passwords are not lost if storage to Vault fails
//
// This method does not perform any operations on the priority queue. Those
// tasks must be handled outside of this method.
func (b *databaseBackend) setStaticAccount(ctx context.Context, s logical.Storage, input *setStaticAccountInput) (*setStaticAccountOutput, error) {
if input == nil || input.Role == nil || input.RoleName == "" {
return nil, errors.New("input was empty when attempting to set credentials for static account")
}
// Re-use WAL ID if present, otherwise PUT a new WAL
output := &setStaticAccountOutput{WALID: input.WALID}
dbConfig, err := b.DatabaseConfig(ctx, s, input.Role.DBName)
if err != nil {
return output, err
}
if dbConfig == nil {
return output, errors.New("the config is currently unset")
}
// If role name isn't in the database's allowed roles, send back a
// permission denied.
if !strutil.StrListContains(dbConfig.AllowedRoles, "*") && !strutil.StrListContainsGlob(dbConfig.AllowedRoles, input.RoleName) {
return output, fmt.Errorf("%q is not an allowed role", input.RoleName)
}
// Get the Database object
dbi, err := b.GetConnection(ctx, s, input.Role.DBName)
if err != nil {
return output, err
}
dbi.RLock()
defer dbi.RUnlock()
// Use password from input if available. This happens if we're restoring from
// a WAL item or processing the rotation queue with an item that has a WAL
// associated with it
var newPassword string
if output.WALID != "" {
wal, err := b.findStaticWAL(ctx, s, output.WALID)
if err != nil {
return output, errwrap.Wrapf("error retrieving WAL entry: {{err}}", err)
}
switch {
case wal != nil && wal.NewPassword != "":
newPassword = wal.NewPassword
default:
if wal == nil {
b.Logger().Error("expected role to have WAL, but WAL not found in storage", "role", input.RoleName, "WAL ID", output.WALID)
} else {
b.Logger().Error("expected WAL to have a new password set, but empty", "role", input.RoleName, "WAL ID", output.WALID)
err = framework.DeleteWAL(ctx, s, output.WALID)
if err != nil {
b.Logger().Warn("failed to delete WAL with no new password", "error", err, "WAL ID", output.WALID)
}
}
// If there's anything wrong with the WAL in storage, we'll need
// to generate a fresh WAL and password
output.WALID = ""
}
}
if output.WALID == "" {
newPassword, err = dbi.database.GeneratePassword(ctx, b.System(), dbConfig.PasswordPolicy)
if err != nil {
return output, err
}
output.WALID, err = framework.PutWAL(ctx, s, staticWALKey, &setCredentialsWAL{
RoleName: input.RoleName,
Username: input.Role.StaticAccount.Username,
NewPassword: newPassword,
LastVaultRotation: input.Role.StaticAccount.LastVaultRotation,
})
if err != nil {
return output, fmt.Errorf("error writing WAL entry: %w", err)
}
b.Logger().Debug("writing WAL", "role", input.RoleName, "WAL ID", output.WALID)
}
updateReq := v5.UpdateUserRequest{
Username: input.Role.StaticAccount.Username,
Password: &v5.ChangePassword{
NewPassword: newPassword,
Statements: v5.Statements{
Commands: input.Role.Statements.Rotation,
},
},
}
_, err = dbi.database.UpdateUser(ctx, updateReq, false)
if err != nil {
b.CloseIfShutdown(dbi, err)
return output, fmt.Errorf("error setting credentials: %w", err)
}
// Store updated role information
// lvr is the known LastVaultRotation
lvr := time.Now()
input.Role.StaticAccount.LastVaultRotation = lvr
input.Role.StaticAccount.Password = newPassword
output.RotationTime = lvr
entry, err := logical.StorageEntryJSON(databaseStaticRolePath+input.RoleName, input.Role)
if err != nil {
return output, err
}
if err := s.Put(ctx, entry); err != nil {
return output, err
}
// Cleanup WAL after successfully rotating and pushing new item on to queue
if err := framework.DeleteWAL(ctx, s, output.WALID); err != nil {
b.Logger().Warn("error deleting WAL", "WAL ID", output.WALID, "error", err)
return output, err
}
b.Logger().Debug("deleted WAL", "WAL ID", output.WALID)
// The WAL has been deleted, return new setStaticAccountOutput without it
return &setStaticAccountOutput{RotationTime: lvr}, nil
}
// initQueue preforms the necessary checks and initializations needed to perform
// automatic credential rotation for roles associated with static accounts. This
// method verifies if a queue is needed (primary server or local mount), and if
// so initializes the queue and launches a go-routine to periodically invoke a
// method to preform the rotations.
//
// initQueue is invoked by the Factory method in a go-routine. The Factory does
// not wait for success or failure of it's tasks before continuing. This is to
// avoid blocking the mount process while loading and evaluating existing roles,
// etc.
func (b *databaseBackend) initQueue(ctx context.Context, conf *logical.BackendConfig, replicationState consts.ReplicationState) {
// Verify this mount is on the primary server, or is a local mount. If not, do
// not create a queue or launch a ticker. Both processing the WAL list and
// populating the queue are done sequentially and before launching a
// go-routine to run the periodic ticker.
if (conf.System.LocalMount() || !replicationState.HasState(consts.ReplicationPerformanceSecondary)) &&
!replicationState.HasState(consts.ReplicationDRSecondary) &&
!replicationState.HasState(consts.ReplicationPerformanceStandby) {
b.Logger().Info("initializing database rotation queue")
// Poll for a PutWAL call that does not return a "read-only storage" error.
// This ensures the startup phases of loading WAL entries from any possible
// failed rotations can complete without error when deleting from storage.
READONLY_LOOP:
for {
select {
case <-ctx.Done():
b.Logger().Info("queue initialization canceled")
return
default:
}
walID, err := framework.PutWAL(ctx, conf.StorageView, staticWALKey, &setCredentialsWAL{RoleName: "vault-readonlytest"})
if walID != "" {
defer framework.DeleteWAL(ctx, conf.StorageView, walID)
}
switch {
case err == nil:
break READONLY_LOOP
case err.Error() == logical.ErrSetupReadOnly.Error():
time.Sleep(10 * time.Millisecond)
default:
b.Logger().Error("deleting nil key resulted in error", "error", err)
return
}
}
// Load roles and populate queue with static accounts
b.populateQueue(ctx, conf.StorageView)
// Launch ticker
queueTickerInterval := defaultQueueTickSeconds * time.Second
if strVal, ok := conf.Config[queueTickIntervalKey]; ok {
newVal, err := strconv.Atoi(strVal)
if err == nil {
queueTickerInterval = time.Duration(newVal) * time.Second
} else {
b.Logger().Error("bad value for %q option: %q", queueTickIntervalKey, strVal)
}
}
go b.runTicker(ctx, queueTickerInterval, conf.StorageView)
}
}
// loadStaticWALs reads WAL entries and returns a map of roles and their
// setCredentialsWAL, if found.
func (b *databaseBackend) loadStaticWALs(ctx context.Context, s logical.Storage) (map[string]*setCredentialsWAL, error) {
keys, err := framework.ListWAL(ctx, s)
if err != nil {
return nil, err
}
if len(keys) == 0 {
b.Logger().Debug("no WAL entries found")
return nil, nil
}
walMap := make(map[string]*setCredentialsWAL)
// Loop through WAL keys and process any rotation ones
for _, walID := range keys {
walEntry, err := b.findStaticWAL(ctx, s, walID)
if err != nil {
b.Logger().Error("error loading static WAL", "id", walID, "error", err)
continue
}
if walEntry == nil {
continue
}
// Verify the static role still exists
roleName := walEntry.RoleName
role, err := b.StaticRole(ctx, s, roleName)
if err != nil {
b.Logger().Warn("unable to read static role", "error", err, "role", roleName)
continue
}
if role == nil || role.StaticAccount == nil {
b.Logger().Debug("deleting WAL with nil role or static account", "WAL ID", walEntry.walID)
if err := framework.DeleteWAL(ctx, s, walEntry.walID); err != nil {
b.Logger().Warn("unable to delete WAL", "error", err, "WAL ID", walEntry.walID)
}
continue
}
if existingWALEntry, exists := walMap[walEntry.RoleName]; exists {
b.Logger().Debug("multiple WALs detected for role", "role", walEntry.RoleName,
"loaded WAL ID", existingWALEntry.walID, "created at", existingWALEntry.walCreatedAt, "last vault rotation", existingWALEntry.LastVaultRotation,
"candidate WAL ID", walEntry.walID, "created at", walEntry.walCreatedAt, "last vault rotation", walEntry.LastVaultRotation)
if walEntry.walCreatedAt > existingWALEntry.walCreatedAt {
// If the existing WAL is older, delete it from storage and fall
// through to inserting our current WAL into the map.
b.Logger().Debug("deleting stale loaded WAL", "WAL ID", existingWALEntry.walID)
err = framework.DeleteWAL(ctx, s, existingWALEntry.walID)
if err != nil {
b.Logger().Warn("unable to delete loaded WAL", "error", err, "WAL ID", existingWALEntry.walID)
}
} else {
// If we already have a more recent WAL entry in the map, delete
// this one and continue onto the next WAL.
b.Logger().Debug("deleting stale candidate WAL", "WAL ID", walEntry.walID)
err = framework.DeleteWAL(ctx, s, walID)
if err != nil {
b.Logger().Warn("unable to delete candidate WAL", "error", err, "WAL ID", walEntry.walID)
}
continue
}
}
b.Logger().Debug("loaded WAL", "WAL ID", walID)
walMap[walEntry.RoleName] = walEntry
}
return walMap, nil
}
// pushItem wraps the internal queue's Push call, to make sure a queue is
// actually available. This is needed because both runTicker and initQueue
// operate in go-routines, and could be accessing the queue concurrently
func (b *databaseBackend) pushItem(item *queue.Item) error {
b.RLock()
unlockFunc := b.RUnlock
defer func() { unlockFunc() }()
if b.credRotationQueue != nil {
return b.credRotationQueue.Push(item)
}
b.Logger().Warn("no queue found during push item")
return nil
}
// popFromRotationQueue wraps the internal queue's Pop call, to make sure a queue is
// actually available. This is needed because both runTicker and initQueue
// operate in go-routines, and could be accessing the queue concurrently
func (b *databaseBackend) popFromRotationQueue() (*queue.Item, error) {
b.RLock()
defer b.RUnlock()
if b.credRotationQueue != nil {
return b.credRotationQueue.Pop()
}
return nil, queue.ErrEmpty
}
// popFromRotationQueueByKey wraps the internal queue's PopByKey call, to make sure a queue is
// actually available. This is needed because both runTicker and initQueue
// operate in go-routines, and could be accessing the queue concurrently
func (b *databaseBackend) popFromRotationQueueByKey(name string) (*queue.Item, error) {
b.RLock()
defer b.RUnlock()
if b.credRotationQueue != nil {
item, err := b.credRotationQueue.PopByKey(name)
if err != nil {
return nil, err
}
if item != nil {
return item, nil
}
}
return nil, queue.ErrEmpty
}