open-vault/api/renewer.go
Jeff Mitchell 90248c79b2
Update lease renewer logic (#4090)
* Add grace period calculation logic to renewer

* Update lease renewer logic.

It is believed by myself and members of the Nomad team that this logic
should be much more robust in terms of causing large numbers of new
secret acquisitions caused by a static grace period. See comments in the
code for details.

Fixes #3414

* Fix some commenting and fix tests

* Add more time to test so that integ tests don't time out

* Fix some review feedback
2018-03-19 15:48:24 -04:00

358 lines
9.8 KiB
Go

package api
import (
"errors"
"math/rand"
"sync"
"time"
)
var (
ErrRenewerMissingInput = errors.New("missing input to renewer")
ErrRenewerMissingSecret = errors.New("missing secret to renew")
ErrRenewerNotRenewable = errors.New("secret is not renewable")
ErrRenewerNoSecretData = errors.New("returned empty secret data")
// DefaultRenewerRenewBuffer is the default size of the buffer for renew
// messages on the channel.
DefaultRenewerRenewBuffer = 5
)
// Renewer is a process for renewing a secret.
//
// renewer, err := client.NewRenewer(&RenewerInput{
// Secret: mySecret,
// })
// go renewer.Renew()
// defer renewer.Stop()
//
// for {
// select {
// case err := <-renewer.DoneCh():
// if err != nil {
// log.Fatal(err)
// }
//
// // Renewal is now over
// case renewal := <-renewer.RenewCh():
// log.Printf("Successfully renewed: %#v", renewal)
// }
// }
//
//
// The `DoneCh` will return if renewal fails or if the remaining lease duration
// after a renewal is less than or equal to the grace (in number of seconds). In
// both cases, the caller should attempt a re-read of the secret. Clients should
// check the return value of the channel to see if renewal was successful.
type Renewer struct {
l sync.Mutex
client *Client
secret *Secret
grace time.Duration
random *rand.Rand
increment int
doneCh chan error
renewCh chan *RenewOutput
stopped bool
stopCh chan struct{}
}
// RenewerInput is used as input to the renew function.
type RenewerInput struct {
// Secret is the secret to renew
Secret *Secret
// Grace is a minimum renewal before returning so the upstream client
// can do a re-read. This can be used to prevent clients from waiting
// too long to read a new credential and incur downtime.
Grace time.Duration
// Rand is the randomizer to use for underlying randomization. If not
// provided, one will be generated and seeded automatically. If provided, it
// is assumed to have already been seeded.
Rand *rand.Rand
// RenewBuffer is the size of the buffered channel where renew messages are
// dispatched.
RenewBuffer int
// The new TTL, in seconds, that should be set on the lease. The TTL set
// here may or may not be honored by the vault server, based on Vault
// configuration or any associated max TTL values.
Increment int
}
// RenewOutput is the metadata returned to the client (if it's listening) to
// renew messages.
type RenewOutput struct {
// RenewedAt is the timestamp when the renewal took place (UTC).
RenewedAt time.Time
// Secret is the underlying renewal data. It's the same struct as all data
// that is returned from Vault, but since this is renewal data, it will not
// usually include the secret itself.
Secret *Secret
}
// NewRenewer creates a new renewer from the given input.
func (c *Client) NewRenewer(i *RenewerInput) (*Renewer, error) {
if i == nil {
return nil, ErrRenewerMissingInput
}
secret := i.Secret
if secret == nil {
return nil, ErrRenewerMissingSecret
}
grace := i.Grace
random := i.Rand
if random == nil {
random = rand.New(rand.NewSource(int64(time.Now().Nanosecond())))
}
renewBuffer := i.RenewBuffer
if renewBuffer == 0 {
renewBuffer = DefaultRenewerRenewBuffer
}
return &Renewer{
client: c,
secret: secret,
grace: grace,
increment: i.Increment,
random: random,
doneCh: make(chan error, 1),
renewCh: make(chan *RenewOutput, renewBuffer),
stopped: false,
stopCh: make(chan struct{}),
}, nil
}
// DoneCh returns the channel where the renewer will publish when renewal stops.
// If there is an error, this will be an error.
func (r *Renewer) DoneCh() <-chan error {
return r.doneCh
}
// RenewCh is a channel that receives a message when a successful renewal takes
// place and includes metadata about the renewal.
func (r *Renewer) RenewCh() <-chan *RenewOutput {
return r.renewCh
}
// Stop stops the renewer.
func (r *Renewer) Stop() {
r.l.Lock()
if !r.stopped {
close(r.stopCh)
r.stopped = true
}
r.l.Unlock()
}
// Renew starts a background process for renewing this secret. When the secret
// has auth data, this attempts to renew the auth (token). When the secret has
// a lease, this attempts to renew the lease.
func (r *Renewer) Renew() {
var result error
if r.secret.Auth != nil {
result = r.renewAuth()
} else {
result = r.renewLease()
}
select {
case r.doneCh <- result:
case <-r.stopCh:
}
}
// renewAuth is a helper for renewing authentication.
func (r *Renewer) renewAuth() error {
if !r.secret.Auth.Renewable || r.secret.Auth.ClientToken == "" {
return ErrRenewerNotRenewable
}
priorDuration := time.Duration(r.secret.Auth.LeaseDuration) * time.Second
r.calculateGrace(priorDuration)
client, token := r.client, r.secret.Auth.ClientToken
for {
// Check if we are stopped.
select {
case <-r.stopCh:
return nil
default:
}
// Renew the auth.
renewal, err := client.Auth().Token().RenewTokenAsSelf(token, r.increment)
if err != nil {
return err
}
// Push a message that a renewal took place.
select {
case r.renewCh <- &RenewOutput{time.Now().UTC(), renewal}:
default:
}
// Somehow, sometimes, this happens.
if renewal == nil || renewal.Auth == nil {
return ErrRenewerNoSecretData
}
// Do nothing if we are not renewable
if !renewal.Auth.Renewable {
return ErrRenewerNotRenewable
}
// Grab the lease duration
leaseDuration := time.Duration(renewal.Auth.LeaseDuration) * time.Second
// We keep evaluating a new grace period so long as the lease is
// extending. Once it stops extending, we've hit the max and need to
// rely on the grace duration.
if leaseDuration > priorDuration {
r.calculateGrace(leaseDuration)
}
priorDuration = leaseDuration
// The sleep duration is set to 2/3 of the current lease duration plus
// 1/3 of the current grace period, which adds jitter.
sleepDuration := time.Duration(float64(leaseDuration.Nanoseconds())*2/3 + float64(r.grace.Nanoseconds())/3)
// If we are within grace, return now; or, if the amount of time we
// would sleep would land us in the grace period. This helps with short
// tokens; for example, you don't want a current lease duration of 4
// seconds, a grace period of 3 seconds, and end up sleeping for more
// than three of those seconds and having a very small budget of time
// to renew.
if leaseDuration <= r.grace || leaseDuration-sleepDuration <= r.grace {
return nil
}
select {
case <-r.stopCh:
return nil
case <-time.After(sleepDuration):
continue
}
}
}
// renewLease is a helper for renewing a lease.
func (r *Renewer) renewLease() error {
if !r.secret.Renewable || r.secret.LeaseID == "" {
return ErrRenewerNotRenewable
}
priorDuration := time.Duration(r.secret.LeaseDuration) * time.Second
r.calculateGrace(priorDuration)
client, leaseID := r.client, r.secret.LeaseID
for {
// Check if we are stopped.
select {
case <-r.stopCh:
return nil
default:
}
// Renew the lease.
renewal, err := client.Sys().Renew(leaseID, r.increment)
if err != nil {
return err
}
// Push a message that a renewal took place.
select {
case r.renewCh <- &RenewOutput{time.Now().UTC(), renewal}:
default:
}
// Somehow, sometimes, this happens.
if renewal == nil {
return ErrRenewerNoSecretData
}
// Do nothing if we are not renewable
if !renewal.Renewable {
return ErrRenewerNotRenewable
}
// Grab the lease duration
leaseDuration := time.Duration(renewal.LeaseDuration) * time.Second
// We keep evaluating a new grace period so long as the lease is
// extending. Once it stops extending, we've hit the max and need to
// rely on the grace duration.
if leaseDuration > priorDuration {
r.calculateGrace(leaseDuration)
}
priorDuration = leaseDuration
// The sleep duration is set to 2/3 of the current lease duration plus
// 1/3 of the current grace period, which adds jitter.
sleepDuration := time.Duration(float64(leaseDuration.Nanoseconds())*2/3 + float64(r.grace.Nanoseconds())/3)
// If we are within grace, return now; or, if the amount of time we
// would sleep would land us in the grace period. This helps with short
// tokens; for example, you don't want a current lease duration of 4
// seconds, a grace period of 3 seconds, and end up sleeping for more
// than three of those seconds and having a very small budget of time
// to renew.
if leaseDuration <= r.grace || leaseDuration-sleepDuration <= r.grace {
return nil
}
select {
case <-r.stopCh:
return nil
case <-time.After(sleepDuration):
continue
}
}
}
// sleepDuration calculates the time to sleep given the base lease duration. The
// base is the resulting lease duration. It will be reduced to 1/3 and
// multiplied by a random float between 0.0 and 1.0. This extra randomness
// prevents multiple clients from all trying to renew simultaneously.
func (r *Renewer) sleepDuration(base time.Duration) time.Duration {
sleep := float64(base)
// Renew at 1/3 the remaining lease. This will give us an opportunity to retry
// at least one more time should the first renewal fail.
sleep = sleep / 3.0
// Use a randomness so many clients do not hit Vault simultaneously.
sleep = sleep * (r.random.Float64() + 1) / 2.0
return time.Duration(sleep)
}
// calculateGrace calculates the grace period based on a reasonable set of
// assumptions given the total lease time; it also adds some jitter to not have
// clients be in sync.
func (r *Renewer) calculateGrace(leaseDuration time.Duration) {
if leaseDuration == 0 {
r.grace = 0
return
}
leaseNanos := float64(leaseDuration.Nanoseconds())
jitterMax := 0.1 * leaseNanos
// For a given lease duration, we want to allow 80-90% of that to elapse,
// so the remaining amount is the grace period
r.grace = time.Duration(jitterMax) + time.Duration(uint64(r.random.Int63())%uint64(jitterMax))
}