90248c79b2
* Add grace period calculation logic to renewer * Update lease renewer logic. It is believed by myself and members of the Nomad team that this logic should be much more robust in terms of causing large numbers of new secret acquisitions caused by a static grace period. See comments in the code for details. Fixes #3414 * Fix some commenting and fix tests * Add more time to test so that integ tests don't time out * Fix some review feedback
358 lines
9.8 KiB
Go
358 lines
9.8 KiB
Go
package api
|
|
|
|
import (
|
|
"errors"
|
|
"math/rand"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
var (
|
|
ErrRenewerMissingInput = errors.New("missing input to renewer")
|
|
ErrRenewerMissingSecret = errors.New("missing secret to renew")
|
|
ErrRenewerNotRenewable = errors.New("secret is not renewable")
|
|
ErrRenewerNoSecretData = errors.New("returned empty secret data")
|
|
|
|
// DefaultRenewerRenewBuffer is the default size of the buffer for renew
|
|
// messages on the channel.
|
|
DefaultRenewerRenewBuffer = 5
|
|
)
|
|
|
|
// Renewer is a process for renewing a secret.
|
|
//
|
|
// renewer, err := client.NewRenewer(&RenewerInput{
|
|
// Secret: mySecret,
|
|
// })
|
|
// go renewer.Renew()
|
|
// defer renewer.Stop()
|
|
//
|
|
// for {
|
|
// select {
|
|
// case err := <-renewer.DoneCh():
|
|
// if err != nil {
|
|
// log.Fatal(err)
|
|
// }
|
|
//
|
|
// // Renewal is now over
|
|
// case renewal := <-renewer.RenewCh():
|
|
// log.Printf("Successfully renewed: %#v", renewal)
|
|
// }
|
|
// }
|
|
//
|
|
//
|
|
// The `DoneCh` will return if renewal fails or if the remaining lease duration
|
|
// after a renewal is less than or equal to the grace (in number of seconds). In
|
|
// both cases, the caller should attempt a re-read of the secret. Clients should
|
|
// check the return value of the channel to see if renewal was successful.
|
|
type Renewer struct {
|
|
l sync.Mutex
|
|
|
|
client *Client
|
|
secret *Secret
|
|
grace time.Duration
|
|
random *rand.Rand
|
|
increment int
|
|
doneCh chan error
|
|
renewCh chan *RenewOutput
|
|
|
|
stopped bool
|
|
stopCh chan struct{}
|
|
}
|
|
|
|
// RenewerInput is used as input to the renew function.
|
|
type RenewerInput struct {
|
|
// Secret is the secret to renew
|
|
Secret *Secret
|
|
|
|
// Grace is a minimum renewal before returning so the upstream client
|
|
// can do a re-read. This can be used to prevent clients from waiting
|
|
// too long to read a new credential and incur downtime.
|
|
Grace time.Duration
|
|
|
|
// Rand is the randomizer to use for underlying randomization. If not
|
|
// provided, one will be generated and seeded automatically. If provided, it
|
|
// is assumed to have already been seeded.
|
|
Rand *rand.Rand
|
|
|
|
// RenewBuffer is the size of the buffered channel where renew messages are
|
|
// dispatched.
|
|
RenewBuffer int
|
|
|
|
// The new TTL, in seconds, that should be set on the lease. The TTL set
|
|
// here may or may not be honored by the vault server, based on Vault
|
|
// configuration or any associated max TTL values.
|
|
Increment int
|
|
}
|
|
|
|
// RenewOutput is the metadata returned to the client (if it's listening) to
|
|
// renew messages.
|
|
type RenewOutput struct {
|
|
// RenewedAt is the timestamp when the renewal took place (UTC).
|
|
RenewedAt time.Time
|
|
|
|
// Secret is the underlying renewal data. It's the same struct as all data
|
|
// that is returned from Vault, but since this is renewal data, it will not
|
|
// usually include the secret itself.
|
|
Secret *Secret
|
|
}
|
|
|
|
// NewRenewer creates a new renewer from the given input.
|
|
func (c *Client) NewRenewer(i *RenewerInput) (*Renewer, error) {
|
|
if i == nil {
|
|
return nil, ErrRenewerMissingInput
|
|
}
|
|
|
|
secret := i.Secret
|
|
if secret == nil {
|
|
return nil, ErrRenewerMissingSecret
|
|
}
|
|
|
|
grace := i.Grace
|
|
|
|
random := i.Rand
|
|
if random == nil {
|
|
random = rand.New(rand.NewSource(int64(time.Now().Nanosecond())))
|
|
}
|
|
|
|
renewBuffer := i.RenewBuffer
|
|
if renewBuffer == 0 {
|
|
renewBuffer = DefaultRenewerRenewBuffer
|
|
}
|
|
|
|
return &Renewer{
|
|
client: c,
|
|
secret: secret,
|
|
grace: grace,
|
|
increment: i.Increment,
|
|
random: random,
|
|
doneCh: make(chan error, 1),
|
|
renewCh: make(chan *RenewOutput, renewBuffer),
|
|
|
|
stopped: false,
|
|
stopCh: make(chan struct{}),
|
|
}, nil
|
|
}
|
|
|
|
// DoneCh returns the channel where the renewer will publish when renewal stops.
|
|
// If there is an error, this will be an error.
|
|
func (r *Renewer) DoneCh() <-chan error {
|
|
return r.doneCh
|
|
}
|
|
|
|
// RenewCh is a channel that receives a message when a successful renewal takes
|
|
// place and includes metadata about the renewal.
|
|
func (r *Renewer) RenewCh() <-chan *RenewOutput {
|
|
return r.renewCh
|
|
}
|
|
|
|
// Stop stops the renewer.
|
|
func (r *Renewer) Stop() {
|
|
r.l.Lock()
|
|
if !r.stopped {
|
|
close(r.stopCh)
|
|
r.stopped = true
|
|
}
|
|
r.l.Unlock()
|
|
}
|
|
|
|
// Renew starts a background process for renewing this secret. When the secret
|
|
// has auth data, this attempts to renew the auth (token). When the secret has
|
|
// a lease, this attempts to renew the lease.
|
|
func (r *Renewer) Renew() {
|
|
var result error
|
|
if r.secret.Auth != nil {
|
|
result = r.renewAuth()
|
|
} else {
|
|
result = r.renewLease()
|
|
}
|
|
|
|
select {
|
|
case r.doneCh <- result:
|
|
case <-r.stopCh:
|
|
}
|
|
}
|
|
|
|
// renewAuth is a helper for renewing authentication.
|
|
func (r *Renewer) renewAuth() error {
|
|
if !r.secret.Auth.Renewable || r.secret.Auth.ClientToken == "" {
|
|
return ErrRenewerNotRenewable
|
|
}
|
|
|
|
priorDuration := time.Duration(r.secret.Auth.LeaseDuration) * time.Second
|
|
r.calculateGrace(priorDuration)
|
|
|
|
client, token := r.client, r.secret.Auth.ClientToken
|
|
|
|
for {
|
|
// Check if we are stopped.
|
|
select {
|
|
case <-r.stopCh:
|
|
return nil
|
|
default:
|
|
}
|
|
|
|
// Renew the auth.
|
|
renewal, err := client.Auth().Token().RenewTokenAsSelf(token, r.increment)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Push a message that a renewal took place.
|
|
select {
|
|
case r.renewCh <- &RenewOutput{time.Now().UTC(), renewal}:
|
|
default:
|
|
}
|
|
|
|
// Somehow, sometimes, this happens.
|
|
if renewal == nil || renewal.Auth == nil {
|
|
return ErrRenewerNoSecretData
|
|
}
|
|
|
|
// Do nothing if we are not renewable
|
|
if !renewal.Auth.Renewable {
|
|
return ErrRenewerNotRenewable
|
|
}
|
|
|
|
// Grab the lease duration
|
|
leaseDuration := time.Duration(renewal.Auth.LeaseDuration) * time.Second
|
|
|
|
// We keep evaluating a new grace period so long as the lease is
|
|
// extending. Once it stops extending, we've hit the max and need to
|
|
// rely on the grace duration.
|
|
if leaseDuration > priorDuration {
|
|
r.calculateGrace(leaseDuration)
|
|
}
|
|
priorDuration = leaseDuration
|
|
|
|
// The sleep duration is set to 2/3 of the current lease duration plus
|
|
// 1/3 of the current grace period, which adds jitter.
|
|
sleepDuration := time.Duration(float64(leaseDuration.Nanoseconds())*2/3 + float64(r.grace.Nanoseconds())/3)
|
|
|
|
// If we are within grace, return now; or, if the amount of time we
|
|
// would sleep would land us in the grace period. This helps with short
|
|
// tokens; for example, you don't want a current lease duration of 4
|
|
// seconds, a grace period of 3 seconds, and end up sleeping for more
|
|
// than three of those seconds and having a very small budget of time
|
|
// to renew.
|
|
if leaseDuration <= r.grace || leaseDuration-sleepDuration <= r.grace {
|
|
return nil
|
|
}
|
|
|
|
select {
|
|
case <-r.stopCh:
|
|
return nil
|
|
case <-time.After(sleepDuration):
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
|
|
// renewLease is a helper for renewing a lease.
|
|
func (r *Renewer) renewLease() error {
|
|
if !r.secret.Renewable || r.secret.LeaseID == "" {
|
|
return ErrRenewerNotRenewable
|
|
}
|
|
|
|
priorDuration := time.Duration(r.secret.LeaseDuration) * time.Second
|
|
r.calculateGrace(priorDuration)
|
|
|
|
client, leaseID := r.client, r.secret.LeaseID
|
|
|
|
for {
|
|
// Check if we are stopped.
|
|
select {
|
|
case <-r.stopCh:
|
|
return nil
|
|
default:
|
|
}
|
|
|
|
// Renew the lease.
|
|
renewal, err := client.Sys().Renew(leaseID, r.increment)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Push a message that a renewal took place.
|
|
select {
|
|
case r.renewCh <- &RenewOutput{time.Now().UTC(), renewal}:
|
|
default:
|
|
}
|
|
|
|
// Somehow, sometimes, this happens.
|
|
if renewal == nil {
|
|
return ErrRenewerNoSecretData
|
|
}
|
|
|
|
// Do nothing if we are not renewable
|
|
if !renewal.Renewable {
|
|
return ErrRenewerNotRenewable
|
|
}
|
|
|
|
// Grab the lease duration
|
|
leaseDuration := time.Duration(renewal.LeaseDuration) * time.Second
|
|
|
|
// We keep evaluating a new grace period so long as the lease is
|
|
// extending. Once it stops extending, we've hit the max and need to
|
|
// rely on the grace duration.
|
|
if leaseDuration > priorDuration {
|
|
r.calculateGrace(leaseDuration)
|
|
}
|
|
priorDuration = leaseDuration
|
|
|
|
// The sleep duration is set to 2/3 of the current lease duration plus
|
|
// 1/3 of the current grace period, which adds jitter.
|
|
sleepDuration := time.Duration(float64(leaseDuration.Nanoseconds())*2/3 + float64(r.grace.Nanoseconds())/3)
|
|
|
|
// If we are within grace, return now; or, if the amount of time we
|
|
// would sleep would land us in the grace period. This helps with short
|
|
// tokens; for example, you don't want a current lease duration of 4
|
|
// seconds, a grace period of 3 seconds, and end up sleeping for more
|
|
// than three of those seconds and having a very small budget of time
|
|
// to renew.
|
|
if leaseDuration <= r.grace || leaseDuration-sleepDuration <= r.grace {
|
|
return nil
|
|
}
|
|
|
|
select {
|
|
case <-r.stopCh:
|
|
return nil
|
|
case <-time.After(sleepDuration):
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
|
|
// sleepDuration calculates the time to sleep given the base lease duration. The
|
|
// base is the resulting lease duration. It will be reduced to 1/3 and
|
|
// multiplied by a random float between 0.0 and 1.0. This extra randomness
|
|
// prevents multiple clients from all trying to renew simultaneously.
|
|
func (r *Renewer) sleepDuration(base time.Duration) time.Duration {
|
|
sleep := float64(base)
|
|
|
|
// Renew at 1/3 the remaining lease. This will give us an opportunity to retry
|
|
// at least one more time should the first renewal fail.
|
|
sleep = sleep / 3.0
|
|
|
|
// Use a randomness so many clients do not hit Vault simultaneously.
|
|
sleep = sleep * (r.random.Float64() + 1) / 2.0
|
|
|
|
return time.Duration(sleep)
|
|
}
|
|
|
|
// calculateGrace calculates the grace period based on a reasonable set of
|
|
// assumptions given the total lease time; it also adds some jitter to not have
|
|
// clients be in sync.
|
|
func (r *Renewer) calculateGrace(leaseDuration time.Duration) {
|
|
if leaseDuration == 0 {
|
|
r.grace = 0
|
|
return
|
|
}
|
|
|
|
leaseNanos := float64(leaseDuration.Nanoseconds())
|
|
jitterMax := 0.1 * leaseNanos
|
|
|
|
// For a given lease duration, we want to allow 80-90% of that to elapse,
|
|
// so the remaining amount is the grace period
|
|
r.grace = time.Duration(jitterMax) + time.Duration(uint64(r.random.Int63())%uint64(jitterMax))
|
|
}
|