2015-03-12 19:44:22 +00:00
package vault
2015-03-13 17:55:54 +00:00
import (
2017-12-01 22:08:38 +00:00
"context"
2015-03-13 17:55:54 +00:00
"encoding/json"
2017-09-05 15:09:00 +00:00
"errors"
2015-03-13 17:55:54 +00:00
"fmt"
2021-02-19 04:20:01 +00:00
"math/rand"
2018-04-04 07:07:10 +00:00
"os"
2015-03-13 17:55:54 +00:00
"path"
2020-10-30 21:45:44 +00:00
"strconv"
2015-03-16 18:33:59 +00:00
"strings"
2015-03-13 18:31:43 +00:00
"sync"
2017-04-26 19:07:58 +00:00
"sync/atomic"
2015-03-13 17:55:54 +00:00
"time"
2015-03-15 20:52:43 +00:00
2020-07-01 19:41:42 +00:00
metrics "github.com/armon/go-metrics"
"github.com/hashicorp/errwrap"
log "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
"github.com/hashicorp/vault/helper/fairshare"
2020-11-13 18:26:58 +00:00
"github.com/hashicorp/vault/helper/metricsutil"
2020-07-01 19:41:42 +00:00
"github.com/hashicorp/vault/helper/namespace"
2019-04-13 07:44:06 +00:00
"github.com/hashicorp/vault/sdk/framework"
2019-04-15 18:10:07 +00:00
"github.com/hashicorp/vault/sdk/helper/base62"
2019-04-12 21:54:35 +00:00
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/helper/jsonutil"
"github.com/hashicorp/vault/sdk/helper/locksutil"
"github.com/hashicorp/vault/sdk/logical"
2020-10-30 21:45:44 +00:00
"github.com/hashicorp/vault/sdk/physical"
2020-06-26 21:13:16 +00:00
"github.com/hashicorp/vault/vault/quotas"
2019-11-08 21:14:03 +00:00
uberAtomic "go.uber.org/atomic"
2015-03-13 17:55:54 +00:00
)
2015-03-13 01:38:15 +00:00
2015-03-12 19:44:22 +00:00
const (
	// expirationSubPath is the sub-path used for the expiration manager
	// view. This is nested under the system view.
	expirationSubPath = "expire/"

	// leaseViewPrefix is the prefix used for the ID based lookup of leases.
	leaseViewPrefix = "id/"

	// tokenViewPrefix is the prefix used for the token based lookup of leases.
	tokenViewPrefix = "token/"

	// maxRevokeAttempts limits how many revoke attempts are made before a
	// lease is given up on.
	maxRevokeAttempts = 6

	// revokeRetryBase is a baseline retry time.
	revokeRetryBase = 10 * time.Second

	// maxLeaseTTL is the default maximum lease duration (32 days).
	maxLeaseTTL = 32 * 24 * time.Hour

	// defaultLeaseTTL is the default lease duration used when no lease is
	// specified. It is deliberately equal to maxLeaseTTL.
	defaultLeaseTTL = maxLeaseTTL

	// maxLeaseThreshold is the maximum lease count before generating a log
	// warning.
	maxLeaseThreshold = 256000

	// numExpirationWorkersDefault is the maximum amount of workers working
	// on lease expiration.
	numExpirationWorkersDefault = 200

	// numExpirationWorkersTest is the number of workers to use for general
	// purpose testing.
	numExpirationWorkersTest = 10

	// fairshareWorkersOverrideVar is presumably the name of the environment
	// variable that overrides the number of revocation workers.
	// NOTE(review): the lookup is not in this part of the file - confirm.
	fairshareWorkersOverrideVar = "VAULT_LEASE_REVOCATION_WORKERS"

	// maxZombieErrorLength limits zombie error messages to 240 characters to
	// be respectful of storage requirements.
	maxZombieErrorLength = 240
)
2018-06-11 15:58:56 +00:00
type pendingInfo struct {
2020-06-15 23:54:36 +00:00
// A subset of the lease entry, cached in memory
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
cachedLeaseInfo * leaseEntry
timer * time . Timer
revokesAttempted uint8
2018-06-11 15:58:56 +00:00
}
2015-03-12 19:44:22 +00:00
// ExpirationManager is used by the Core to manage leases. Secrets
// can provide a lease, meaning that they can be renewed or revoked.
// If a secret is not renewed in timely manner, it may be expired, and
// the ExpirationManager will handle doing automatic revocation.
type ExpirationManager struct {
2018-08-23 01:53:04 +00:00
core * Core
2015-03-24 01:00:14 +00:00
router * Router
2015-04-10 21:21:23 +00:00
idView * BarrierView
tokenView * BarrierView
2015-03-24 01:00:14 +00:00
tokenStore * TokenStore
2016-08-19 20:45:17 +00:00
logger log . Logger
2015-03-16 01:06:19 +00:00
2020-05-21 17:41:03 +00:00
// Although the data structure itself is atomic,
// pendingLock should be held to ensure lease modifications
// are atomic (with respect to storage, expiration time,
// and particularly the lease count.)
2020-06-15 23:54:36 +00:00
// The nonexpiring map holds entries for root tokens with
// TTL zero, which we want to count but have no timer associated.
2020-05-21 17:41:03 +00:00
pending sync . Map
2020-06-15 23:54:36 +00:00
nonexpiring sync . Map
2020-05-21 17:41:03 +00:00
leaseCount int
2017-09-05 15:09:00 +00:00
pendingLock sync . RWMutex
2017-04-26 19:07:58 +00:00
2021-04-29 15:12:02 +00:00
// Track expired leases that have been determined to be irrevocable (without
// manual intervention). These irrevocable leases are referred to as
// "zombies" or "zombie leases"
zombies sync . Map
2020-06-15 23:54:36 +00:00
// The uniquePolicies map holds policy sets, so they can
// be deduplicated. It is periodically emptied to prevent
// unbounded growth.
uniquePolicies map [ string ] [ ] string
emptyUniquePolicies * time . Ticker
2018-06-09 19:35:22 +00:00
tidyLock * int32
2017-09-05 15:09:00 +00:00
2018-06-09 19:35:22 +00:00
restoreMode * int32
2017-09-05 15:09:00 +00:00
restoreModeLock sync . RWMutex
restoreRequestLock sync . RWMutex
restoreLocks [ ] * locksutil . LockEntry
restoreLoaded sync . Map
quitCh chan struct { }
2017-12-01 22:08:38 +00:00
2020-03-10 20:01:20 +00:00
coreStateLock * DeadlockRWMutex
2018-02-14 14:22:46 +00:00
quitContext context . Context
2018-06-09 19:35:22 +00:00
leaseCheckCounter * uint32
2018-04-04 07:07:10 +00:00
logLeaseExpirations bool
2018-09-18 03:03:00 +00:00
expireFunc ExpireLeaseStrategy
2019-11-08 21:14:03 +00:00
2020-10-30 21:45:44 +00:00
revokePermitPool * physical . PermitPool
2019-11-08 21:14:03 +00:00
// testRegisterAuthFailure, if set to true, triggers an explicit failure on
// RegisterAuth to simulate a partial failure during a token creation
// request. This value should only be set by tests.
testRegisterAuthFailure uberAtomic . Bool
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
jobManager * fairshare . JobManager
2018-09-18 03:03:00 +00:00
}
2021-01-19 22:51:41 +00:00
// ExpireLeaseStrategy is the function type of ExpirationManager.expireFunc.
// expireLeaseStrategyFairsharing in this file has this signature; its
// arguments are a context, the expiration manager, a lease ID and the
// namespace of that lease.
type ExpireLeaseStrategy func ( context . Context , * ExpirationManager , string , * namespace . Namespace )
2018-09-18 03:03:00 +00:00
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
// revocationJob should only be created through newRevocationJob()
type revocationJob struct {
2021-02-27 00:00:39 +00:00
leaseID string
ns * namespace . Namespace
m * ExpirationManager
nsCtx context . Context
startTime time . Time
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
}
2021-02-27 00:00:39 +00:00
func newRevocationJob ( nsCtx context . Context , leaseID string , ns * namespace . Namespace , m * ExpirationManager ) ( * revocationJob , error ) {
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
if leaseID == "" {
return nil , fmt . Errorf ( "cannot have empty lease id" )
}
if m == nil {
return nil , fmt . Errorf ( "cannot have nil expiration manager" )
}
if nsCtx == nil {
return nil , fmt . Errorf ( "cannot have nil namespace context.Context" )
}
return & revocationJob {
2021-02-27 00:00:39 +00:00
leaseID : leaseID ,
ns : ns ,
m : m ,
nsCtx : nsCtx ,
startTime : time . Now ( ) ,
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
} , nil
}
func ( r * revocationJob ) Execute ( ) error {
2021-02-27 00:00:39 +00:00
r . m . core . metricSink . IncrCounterWithLabels ( [ ] string { "expire" , "lease_expiration" } , 1 , [ ] metrics . Label { metricsutil . NamespaceLabel ( r . ns ) } )
r . m . core . metricSink . MeasureSinceWithLabels ( [ ] string { "expire" , "lease_expiration" , "time_in_queue" } , r . startTime , [ ] metrics . Label { metricsutil . NamespaceLabel ( r . ns ) } )
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
// don't start the timer until the revocation is being executed
revokeCtx , cancel := context . WithTimeout ( r . nsCtx , DefaultMaxRequestDuration )
defer cancel ( )
go func ( ) {
select {
case <- r . m . quitCh :
cancel ( )
case <- revokeCtx . Done ( ) :
}
} ( )
select {
case <- r . m . quitCh :
r . m . logger . Error ( "shutting down, not attempting further revocation of lease" , "lease_id" , r . leaseID )
return nil
case <- r . m . quitContext . Done ( ) :
r . m . logger . Error ( "core context canceled, not attempting further revocation of lease" , "lease_id" , r . leaseID )
return nil
default :
}
r . m . coreStateLock . RLock ( )
err := r . m . Revoke ( revokeCtx , r . leaseID )
r . m . coreStateLock . RUnlock ( )
return err
}
func ( r * revocationJob ) OnFailure ( err error ) {
2021-02-27 00:00:39 +00:00
r . m . core . metricSink . IncrCounterWithLabels ( [ ] string { "expire" , "lease_expiration" , "error" } , 1 , [ ] metrics . Label { metricsutil . NamespaceLabel ( r . ns ) } )
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
r . m . logger . Error ( "failed to revoke lease" , "lease_id" , r . leaseID , "error" , err )
r . m . pendingLock . Lock ( )
defer r . m . pendingLock . Unlock ( )
pendingRaw , ok := r . m . pending . Load ( r . leaseID )
if ! ok {
r . m . logger . Warn ( "failed to find lease in pending map for revocation retry" , "lease_id" , r . leaseID )
return
}
pending := pendingRaw . ( pendingInfo )
pending . revokesAttempted ++
if pending . revokesAttempted >= maxRevokeAttempts {
r . m . logger . Trace ( "lease has consumed all retry attempts" , "lease_id" , r . leaseID )
2021-04-29 15:12:02 +00:00
le , loadErr := r . m . loadEntry ( r . nsCtx , r . leaseID )
if loadErr != nil {
r . m . logger . Warn ( "failed to mark lease as zombie - failed to load" , "lease_id" , r . leaseID , "err" , loadErr )
return
}
r . m . markLeaseAsZombie ( r . nsCtx , le , errors . New ( "lease has consumed all retry attempts" ) )
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
return
}
2021-02-19 04:20:01 +00:00
pending . timer . Reset ( revokeExponentialBackoff ( pending . revokesAttempted ) )
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
r . m . pending . Store ( r . leaseID , pending )
}
func expireLeaseStrategyFairsharing ( ctx context . Context , m * ExpirationManager , leaseID string , ns * namespace . Namespace ) {
nsCtx := namespace . ContextWithNamespace ( ctx , ns )
var mountAccessor string
m . coreStateLock . RLock ( )
mount := m . core . router . MatchingMountEntry ( nsCtx , leaseID )
m . coreStateLock . RUnlock ( )
if mount == nil {
// figure out what this means - if we couldn't find the mount, can we automatically revoke
m . logger . Debug ( "could not find lease path" , "lease_id" , leaseID )
mountAccessor = "mount-accessor-not-found"
} else {
mountAccessor = mount . Accessor
}
2021-02-27 00:00:39 +00:00
job , err := newRevocationJob ( nsCtx , leaseID , ns , m )
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
if err != nil {
m . logger . Warn ( "error creating revocation job" , "error" , err )
return
}
m . jobManager . AddJob ( job , mountAccessor )
}
2021-02-19 04:20:01 +00:00
// revokeExponentialBackoff returns a jittered backoff duration for the given
// retry attempt. The nominal delay is (2^attempt * revokeRetryBase); the
// returned value is drawn at random from the range nominal +/- 50%.
func revokeExponentialBackoff(attempt uint8) time.Duration {
	nominal := (1 << attempt) * revokeRetryBase
	jitter := 0.5 * float64(nominal)

	// Start at the lower bound and add a random share of the full 2*jitter span.
	lowerBound := float64(nominal) - jitter
	spread := rand.Float64() * (2 * jitter)

	return time.Duration(lowerBound + spread)
}
2018-09-18 03:03:00 +00:00
// revokeIDFunc is invoked when a given ID is expired
2021-01-19 22:51:41 +00:00
func expireLeaseStrategyRevoke ( ctx context . Context , m * ExpirationManager , leaseID string , ns * namespace . Namespace ) {
2018-09-18 03:03:00 +00:00
for attempt := uint ( 0 ) ; attempt < maxRevokeAttempts ; attempt ++ {
2020-10-30 21:45:44 +00:00
releasePermit := func ( ) { }
if m . revokePermitPool != nil {
m . logger . Trace ( "expiring lease; waiting for permit pool" )
m . revokePermitPool . Acquire ( )
releasePermit = m . revokePermitPool . Release
m . logger . Trace ( "expiring lease; got permit pool" )
}
2021-01-19 22:51:41 +00:00
metrics . IncrCounterWithLabels ( [ ] string { "expire" , "lease_expiration" } , 1 , [ ] metrics . Label { { "namespace" , ns . ID } } )
2020-10-30 21:45:44 +00:00
2018-09-18 03:03:00 +00:00
revokeCtx , cancel := context . WithTimeout ( ctx , DefaultMaxRequestDuration )
2021-01-19 22:51:41 +00:00
revokeCtx = namespace . ContextWithNamespace ( revokeCtx , ns )
2018-09-18 03:03:00 +00:00
go func ( ) {
select {
case <- ctx . Done ( ) :
case <- m . quitCh :
cancel ( )
2018-10-13 02:02:59 +00:00
case <- revokeCtx . Done ( ) :
2018-09-18 03:03:00 +00:00
}
} ( )
select {
case <- m . quitCh :
2021-01-19 22:51:41 +00:00
m . logger . Error ( "shutting down, not attempting further revocation of lease" , "lease_id" , leaseID )
2020-10-30 21:45:44 +00:00
releasePermit ( )
2018-10-13 02:02:59 +00:00
cancel ( )
2018-09-18 03:03:00 +00:00
return
case <- m . quitContext . Done ( ) :
2021-01-19 22:51:41 +00:00
m . logger . Error ( "core context canceled, not attempting further revocation of lease" , "lease_id" , leaseID )
2020-10-30 21:45:44 +00:00
releasePermit ( )
2018-10-13 02:02:59 +00:00
cancel ( )
2018-09-18 03:03:00 +00:00
return
default :
}
m . coreStateLock . RLock ( )
2021-01-19 22:51:41 +00:00
err := m . Revoke ( revokeCtx , leaseID )
2018-09-18 03:03:00 +00:00
m . coreStateLock . RUnlock ( )
2020-10-30 21:45:44 +00:00
releasePermit ( )
2018-09-18 03:03:00 +00:00
cancel ( )
if err == nil {
return
}
2021-01-19 22:51:41 +00:00
metrics . IncrCounterWithLabels ( [ ] string { "expire" , "lease_expiration" , "error" } , 1 , [ ] metrics . Label { { "namespace" , ns . ID } } )
2020-10-30 21:45:44 +00:00
2021-01-19 22:51:41 +00:00
m . logger . Error ( "failed to revoke lease" , "lease_id" , leaseID , "error" , err )
2018-09-18 03:03:00 +00:00
time . Sleep ( ( 1 << attempt ) * revokeRetryBase )
}
2021-01-19 22:51:41 +00:00
m . logger . Error ( "maximum revoke attempts reached" , "lease_id" , leaseID )
2015-03-12 19:44:22 +00:00
}
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
// getNumExpirationWorkers returns the number of lease revocation workers to
// run. The core's configured value is used unless the environment variable
// named by fairshareWorkersOverrideVar holds an integer between 1 and 10000
// inclusive, in which case that value wins. Invalid overrides are logged as
// warnings and ignored.
func getNumExpirationWorkers(c *Core, l log.Logger) int {
	override := os.Getenv(fairshareWorkersOverrideVar)
	if override == "" {
		return c.numExpirationWorkers
	}

	parsed, err := strconv.Atoi(override)
	switch {
	case err != nil:
		l.Warn("vault lease revocation workers override must be an integer", "value", override)
	case parsed < 1 || parsed > 10000:
		l.Warn("vault lease revocation workers override out of range", "value", parsed)
	default:
		return parsed
	}

	return c.numExpirationWorkers
}
2015-03-12 19:44:22 +00:00
// NewExpirationManager creates a new ExpirationManager that is backed
2015-03-13 01:38:15 +00:00
// using a given view, and uses the provided router for revocation.
2018-09-18 03:03:00 +00:00
func NewExpirationManager ( c * Core , view * BarrierView , e ExpireLeaseStrategy , logger log . Logger ) * ExpirationManager {
2020-10-30 21:45:44 +00:00
var permitPool * physical . PermitPool
if os . Getenv ( "VAULT_16_REVOKE_PERMITPOOL" ) != "" {
permitPoolSize := 50
permitPoolSizeRaw , err := strconv . Atoi ( os . Getenv ( "VAULT_16_REVOKE_PERMITPOOL" ) )
if err == nil && permitPoolSizeRaw > 0 {
permitPoolSize = permitPoolSizeRaw
}
permitPool = physical . NewPermitPool ( permitPoolSize )
}
2021-02-27 00:00:39 +00:00
jobManager := fairshare . NewJobManager ( "expire" , getNumExpirationWorkers ( c , logger ) , logger . Named ( "job-manager" ) , c . metricSink )
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
jobManager . Start ( )
2015-03-12 19:44:22 +00:00
exp := & ExpirationManager {
2020-06-15 23:54:36 +00:00
core : c ,
router : c . router ,
idView : view . SubView ( leaseViewPrefix ) ,
tokenView : view . SubView ( tokenViewPrefix ) ,
tokenStore : c . tokenStore ,
logger : logger ,
pending : sync . Map { } ,
nonexpiring : sync . Map { } ,
leaseCount : 0 ,
tidyLock : new ( int32 ) ,
uniquePolicies : make ( map [ string ] [ ] string ) ,
emptyUniquePolicies : time . NewTicker ( 7 * 24 * time . Hour ) ,
2017-09-05 15:09:00 +00:00
// new instances of the expiration manager will go immediately into
// restore mode
2018-06-09 19:35:22 +00:00
restoreMode : new ( int32 ) ,
2017-09-05 15:09:00 +00:00
restoreLocks : locksutil . CreateLocks ( ) ,
quitCh : make ( chan struct { } ) ,
2017-12-01 22:08:38 +00:00
2018-02-14 14:22:46 +00:00
coreStateLock : & c . stateLock ,
quitContext : c . activeContext ,
2018-06-09 19:35:22 +00:00
leaseCheckCounter : new ( uint32 ) ,
2018-04-04 07:07:10 +00:00
logLeaseExpirations : os . Getenv ( "VAULT_SKIP_LOGGING_LEASE_EXPIRATIONS" ) == "" ,
2018-09-18 03:03:00 +00:00
expireFunc : e ,
2020-10-30 21:45:44 +00:00
revokePermitPool : permitPool ,
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
jobManager : jobManager ,
2015-03-12 19:44:22 +00:00
}
2018-06-09 19:35:22 +00:00
* exp . restoreMode = 1
2017-12-01 22:08:38 +00:00
if exp . logger == nil {
2018-04-03 00:46:59 +00:00
opts := log . LoggerOptions { Name : "expiration_manager" }
exp . logger = log . New ( & opts )
2017-12-01 22:08:38 +00:00
}
2020-06-15 23:54:36 +00:00
go exp . uniquePoliciesGc ( )
2015-03-12 19:44:22 +00:00
return exp
}
// setupExpiration is invoked after we've loaded the mount table to
// initialize the expiration manager
2018-09-18 03:03:00 +00:00
func ( c * Core ) setupExpiration ( e ExpireLeaseStrategy ) error {
2015-10-12 20:33:54 +00:00
c . metricsMutex . Lock ( )
defer c . metricsMutex . Unlock ( )
2015-03-12 19:44:22 +00:00
// Create a sub-view
2015-09-04 20:58:12 +00:00
view := c . systemBarrierView . SubView ( expirationSubPath )
2015-03-12 19:44:22 +00:00
// Create the manager
2018-09-05 19:52:54 +00:00
expLogger := c . baseLogger . Named ( "expiration" )
c . AddLogger ( expLogger )
2018-09-18 03:03:00 +00:00
mgr := NewExpirationManager ( c , view , e , expLogger )
2015-03-12 19:44:22 +00:00
c . expiration = mgr
2015-03-13 18:20:36 +00:00
2015-04-03 18:40:08 +00:00
// Link the token store to this
c . tokenStore . SetExpirationManager ( mgr )
2015-03-13 18:20:36 +00:00
// Restore the existing state
2018-04-03 00:46:59 +00:00
c . logger . Info ( "restoring leases" )
2017-09-05 15:09:00 +00:00
errorFunc := func ( ) {
2018-04-03 00:46:59 +00:00
c . logger . Error ( "shutting down" )
2017-09-05 15:09:00 +00:00
if err := c . Shutdown ( ) ; err != nil {
2018-10-09 16:43:17 +00:00
c . logger . Error ( "error shutting down core" , "error" , err )
2017-09-05 15:09:00 +00:00
}
2015-03-13 18:20:36 +00:00
}
2017-09-11 18:49:08 +00:00
go c . expiration . Restore ( errorFunc )
2017-09-05 15:09:00 +00:00
2015-03-13 18:20:36 +00:00
return nil
}
// stopExpiration is used to stop the expiration manager before
// sealing the Vault.
func ( c * Core ) stopExpiration ( ) error {
2015-04-14 20:32:56 +00:00
if c . expiration != nil {
if err := c . expiration . Stop ( ) ; err != nil {
return err
}
2015-10-12 20:33:54 +00:00
c . metricsMutex . Lock ( )
defer c . metricsMutex . Unlock ( )
2015-04-14 20:32:56 +00:00
c . expiration = nil
2015-03-13 18:20:36 +00:00
}
return nil
}
2017-09-05 15:09:00 +00:00
// lockLease acquires the restore lock that the given lease ID maps to.
func (m *ExpirationManager) lockLease(leaseID string) {
	leaseLock := locksutil.LockForKey(m.restoreLocks, leaseID)
	leaseLock.Lock()
}
// unlockLease releases the restore lock that the given lease ID maps to.
func (m *ExpirationManager) unlockLease(leaseID string) {
	leaseLock := locksutil.LockForKey(m.restoreLocks, leaseID)
	leaseLock.Unlock()
}
// inRestoreMode returns if we are currently in restore mode
func ( m * ExpirationManager ) inRestoreMode ( ) bool {
2018-06-09 19:35:22 +00:00
return atomic . LoadInt32 ( m . restoreMode ) == 1
2017-09-05 15:09:00 +00:00
}
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) invalidate ( key string ) {
switch {
case strings . HasPrefix ( key , leaseViewPrefix ) :
leaseID := strings . TrimPrefix ( key , leaseViewPrefix )
2020-06-26 21:13:16 +00:00
ctx := m . quitContext
_ , nsID := namespace . SplitIDFromString ( leaseID )
leaseNS := namespace . RootNamespace
var err error
if nsID != "" {
leaseNS , err = NamespaceByID ( ctx , nsID , m . core )
if err != nil {
m . logger . Error ( "failed to invalidate lease entry" , "error" , err )
return
}
2018-09-18 03:03:00 +00:00
}
2020-06-15 23:54:36 +00:00
2020-06-26 21:13:16 +00:00
le , err := m . loadEntryInternal ( namespace . ContextWithNamespace ( ctx , leaseNS ) , leaseID , false , false )
if err != nil {
m . logger . Error ( "failed to invalidate lease entry" , "error" , err )
return
}
2020-06-15 23:54:36 +00:00
2020-06-26 21:13:16 +00:00
m . pendingLock . Lock ( )
defer m . pendingLock . Unlock ( )
info , ok := m . pending . Load ( leaseID )
switch {
case ok :
switch {
case le == nil :
// Handle lease deletion
pending := info . ( pendingInfo )
pending . timer . Stop ( )
m . pending . Delete ( leaseID )
m . leaseCount --
if err := m . core . quotasHandleLeases ( ctx , quotas . LeaseActionDeleted , [ ] string { leaseID } ) ; err != nil {
2020-07-01 19:41:42 +00:00
m . logger . Error ( "failed to update quota on lease invalidation" , "error" , err )
2020-06-26 21:13:16 +00:00
return
}
default :
// Handle lease update
2020-09-23 18:46:22 +00:00
m . updatePendingInternal ( le )
2020-06-26 21:13:16 +00:00
}
default :
// There is no entry in the pending map and the invalidation
2020-10-08 00:27:45 +00:00
// resulted in a nil entry.
2020-06-26 21:13:16 +00:00
if le == nil {
2020-10-08 00:27:45 +00:00
// If in the nonexpiring map, remove there.
m . nonexpiring . Delete ( leaseID )
2020-06-26 21:13:16 +00:00
return
}
// Handle lease creation
2020-09-23 18:46:22 +00:00
m . updatePendingInternal ( le )
2020-06-26 21:13:16 +00:00
}
2018-09-18 03:03:00 +00:00
}
}
2017-05-04 16:11:00 +00:00
// Tidy cleans up the dangling storage entries for leases. It scans the storage
// view to find all the available leases, checks if the token embedded in it is
// either empty or invalid and in both the cases, it revokes them. It also uses
// a token cache to avoid multiple lookups of the same token ID. It is normally
// not required to use the API that invokes this. This is only intended to
// clean up the corrupt storage due to bugs.
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) Tidy ( ctx context . Context ) error {
2017-09-05 15:09:00 +00:00
if m . inRestoreMode ( ) {
return errors . New ( "cannot run tidy while restoring leases" )
}
2017-03-07 20:22:21 +00:00
var tidyErrors * multierror . Error
2018-06-16 22:21:33 +00:00
logger := m . logger . Named ( "tidy" )
2018-09-05 19:52:54 +00:00
m . core . AddLogger ( logger )
2018-06-16 22:21:33 +00:00
2018-06-09 19:35:22 +00:00
if ! atomic . CompareAndSwapInt32 ( m . tidyLock , 0 , 1 ) {
2018-06-16 22:21:33 +00:00
logger . Warn ( "tidy operation on leases is already in progress" )
return nil
2017-04-27 20:22:19 +00:00
}
2018-06-09 19:35:22 +00:00
defer atomic . CompareAndSwapInt32 ( m . tidyLock , 1 , 0 )
2017-04-27 20:22:19 +00:00
2018-06-16 22:21:33 +00:00
logger . Info ( "beginning tidy operation on leases" )
defer logger . Info ( "finished tidy operation on leases" )
2017-04-27 20:22:19 +00:00
2017-04-26 20:54:48 +00:00
// Create a cache to keep track of looked up tokens
2017-04-27 15:31:42 +00:00
tokenCache := make ( map [ string ] bool )
2017-05-05 14:48:12 +00:00
var countLease , revokedCount , deletedCountInvalidToken , deletedCountEmptyToken int64
2017-04-26 20:54:48 +00:00
2017-03-07 20:22:21 +00:00
tidyFunc := func ( leaseID string ) {
2017-05-03 14:54:07 +00:00
countLease ++
if countLease % 500 == 0 {
2018-06-16 22:21:33 +00:00
logger . Info ( "tidying leases" , "progress" , countLease )
2017-04-27 14:56:19 +00:00
}
2017-04-26 19:48:28 +00:00
2018-09-18 03:03:00 +00:00
le , err := m . loadEntry ( ctx , leaseID )
2017-03-07 20:22:21 +00:00
if err != nil {
2018-04-05 15:49:21 +00:00
tidyErrors = multierror . Append ( tidyErrors , errwrap . Wrapf ( fmt . Sprintf ( "failed to load the lease ID %q: {{err}}" , leaseID ) , err ) )
2017-03-07 20:22:21 +00:00
return
}
if le == nil {
2018-04-05 15:49:21 +00:00
tidyErrors = multierror . Append ( tidyErrors , errwrap . Wrapf ( fmt . Sprintf ( "nil entry for lease ID %q: {{err}}" , leaseID ) , err ) )
2017-03-07 20:22:21 +00:00
return
}
2017-05-02 21:11:35 +00:00
var isValid , ok bool
2017-04-26 19:48:28 +00:00
revokeLease := false
2017-03-07 20:22:21 +00:00
if le . ClientToken == "" {
2018-06-16 22:21:33 +00:00
logger . Debug ( "revoking lease which has an empty token" , "lease_id" , leaseID )
2017-04-26 19:48:28 +00:00
revokeLease = true
2017-05-03 14:54:07 +00:00
deletedCountEmptyToken ++
2017-05-02 20:53:41 +00:00
goto REVOKE_CHECK
2017-03-07 20:22:21 +00:00
}
2017-05-02 20:53:41 +00:00
isValid , ok = tokenCache [ le . ClientToken ]
2017-04-27 15:31:42 +00:00
if ! ok {
lock := locksutil . LockForKey ( m . tokenStore . tokenLocks , le . ClientToken )
lock . RLock ( )
2018-09-18 03:03:00 +00:00
te , err := m . tokenStore . lookupInternal ( ctx , le . ClientToken , false , true )
2017-04-27 15:31:42 +00:00
lock . RUnlock ( )
2017-04-26 20:54:48 +00:00
2017-04-27 15:31:42 +00:00
if err != nil {
2018-04-05 15:49:21 +00:00
tidyErrors = multierror . Append ( tidyErrors , errwrap . Wrapf ( "failed to lookup token: {{err}}" , err ) )
2017-04-27 15:31:42 +00:00
return
}
2017-04-27 15:08:11 +00:00
2017-04-27 15:31:42 +00:00
if te == nil {
2018-06-16 22:21:33 +00:00
logger . Debug ( "revoking lease which holds an invalid token" , "lease_id" , leaseID )
2017-04-27 15:31:42 +00:00
revokeLease = true
2017-05-03 14:54:07 +00:00
deletedCountInvalidToken ++
2017-04-27 15:31:42 +00:00
tokenCache [ le . ClientToken ] = false
} else {
tokenCache [ le . ClientToken ] = true
}
2017-05-05 14:26:40 +00:00
goto REVOKE_CHECK
2017-04-27 15:08:11 +00:00
} else {
2017-04-27 15:31:42 +00:00
if isValid {
return
}
2017-09-05 15:09:00 +00:00
2018-06-16 22:21:33 +00:00
logger . Debug ( "revoking lease which contains an invalid token" , "lease_id" , leaseID )
2017-09-05 15:09:00 +00:00
revokeLease = true
deletedCountInvalidToken ++
2017-05-05 14:26:40 +00:00
goto REVOKE_CHECK
2017-04-26 19:48:28 +00:00
}
2017-05-02 20:53:41 +00:00
REVOKE_CHECK :
2017-04-26 19:48:28 +00:00
if revokeLease {
2017-03-07 20:22:21 +00:00
// Force the revocation and skip going through the token store
// again
2018-09-18 03:03:00 +00:00
err = m . revokeCommon ( ctx , leaseID , true , true )
2017-03-07 20:22:21 +00:00
if err != nil {
2018-04-05 15:49:21 +00:00
tidyErrors = multierror . Append ( tidyErrors , errwrap . Wrapf ( fmt . Sprintf ( "failed to revoke an invalid lease with ID %q: {{err}}" , leaseID ) , err ) )
2017-03-07 20:22:21 +00:00
return
}
2017-05-05 14:48:12 +00:00
revokedCount ++
2017-03-07 20:22:21 +00:00
}
}
2018-09-18 03:03:00 +00:00
ns , err := namespace . FromContext ( ctx )
if err != nil {
return err
}
leaseView := m . leaseView ( ns )
if err := logical . ScanView ( m . quitContext , leaseView , tidyFunc ) ; err != nil {
2017-04-27 20:22:19 +00:00
return err
2017-03-07 20:22:21 +00:00
}
2018-06-16 22:21:33 +00:00
logger . Info ( "number of leases scanned" , "count" , countLease )
logger . Info ( "number of leases which had empty tokens" , "count" , deletedCountEmptyToken )
logger . Info ( "number of leases which had invalid tokens" , "count" , deletedCountInvalidToken )
logger . Info ( "number of leases successfully revoked" , "count" , revokedCount )
2017-05-03 14:54:07 +00:00
2017-04-27 17:48:29 +00:00
return tidyErrors . ErrorOrNil ( )
2017-03-07 20:22:21 +00:00
}
2015-03-13 18:20:36 +00:00
// Restore is used to recover the lease states when starting.
// This is used after starting the vault.
2017-09-11 18:49:08 +00:00
func ( m * ExpirationManager ) Restore ( errorFunc func ( ) ) ( retErr error ) {
2017-09-05 15:09:00 +00:00
defer func ( ) {
// Turn off restore mode. We can do this safely without the lock because
// if restore mode finished successfully, restore mode was already
// disabled with the lock. In an error state, this will allow the
// Stop() function to shut everything down.
2018-06-09 19:35:22 +00:00
atomic . StoreInt32 ( m . restoreMode , 0 )
2017-09-05 15:09:00 +00:00
switch {
case retErr == nil :
2018-08-17 20:06:47 +00:00
case strings . Contains ( retErr . Error ( ) , context . Canceled . Error ( ) ) :
2018-07-13 18:30:08 +00:00
// Don't run error func because we lost leadership
2019-11-07 22:10:47 +00:00
m . logger . Warn ( "context canceled while restoring leases, stopping lease loading" )
2018-07-13 18:30:08 +00:00
retErr = nil
2017-09-05 15:09:00 +00:00
case errwrap . Contains ( retErr , ErrBarrierSealed . Error ( ) ) :
// Don't run error func because we're likely already shutting down
2018-04-03 00:46:59 +00:00
m . logger . Warn ( "barrier sealed while restoring leases, stopping lease loading" )
2017-09-05 15:09:00 +00:00
retErr = nil
default :
2018-04-03 00:46:59 +00:00
m . logger . Error ( "error restoring leases" , "error" , retErr )
2017-09-05 15:09:00 +00:00
if errorFunc != nil {
errorFunc ( )
}
}
} ( )
2015-03-16 18:33:59 +00:00
// Accumulate existing leases
2018-04-03 00:46:59 +00:00
m . logger . Debug ( "collecting leases" )
2018-09-18 03:03:00 +00:00
existing , leaseCount , err := m . collectLeases ( )
2015-03-18 19:03:33 +00:00
if err != nil {
2018-09-18 03:03:00 +00:00
return err
2015-03-16 18:33:59 +00:00
}
2018-09-18 03:03:00 +00:00
m . logger . Debug ( "leases collected" , "num_existing" , leaseCount )
2017-02-06 23:30:13 +00:00
2017-02-16 18:16:06 +00:00
// Make the channels used for the worker pool
2018-09-18 03:03:00 +00:00
type lease struct {
namespace * namespace . Namespace
id string
}
broker := make ( chan * lease )
2017-02-16 18:16:06 +00:00
quit := make ( chan bool )
// Buffer these channels to prevent deadlocks
errs := make ( chan error , len ( existing ) )
2017-09-05 15:09:00 +00:00
result := make ( chan struct { } , len ( existing ) )
2017-02-16 18:16:06 +00:00
// Use a wait group
wg := & sync . WaitGroup { }
// Create 64 workers to distribute work to
for i := 0 ; i < consts . ExpirationRestoreWorkerCount ; i ++ {
wg . Add ( 1 )
go func ( ) {
defer wg . Done ( )
for {
select {
2018-09-18 03:03:00 +00:00
case lease , ok := <- broker :
2017-02-16 18:16:06 +00:00
// broker has been closed, we are done
if ! ok {
return
}
2018-09-18 03:03:00 +00:00
ctx := namespace . ContextWithNamespace ( m . quitContext , lease . namespace )
err := m . processRestore ( ctx , lease . id )
2017-02-16 18:16:06 +00:00
if err != nil {
errs <- err
continue
}
2017-09-05 15:09:00 +00:00
// Send message that lease is done
result <- struct { } { }
2017-02-16 18:16:06 +00:00
// quit early
case <- quit :
return
2017-09-05 15:09:00 +00:00
case <- m . quitCh :
return
2017-02-16 18:16:06 +00:00
}
}
} ( )
}
// Distribute the collected keys to the workers in a go routine
wg . Add ( 1 )
go func ( ) {
defer wg . Done ( )
2018-09-18 03:03:00 +00:00
i := 0
for ns := range existing {
for _ , leaseID := range existing [ ns ] {
i ++
if i % 500 == 0 {
m . logger . Debug ( "leases loading" , "progress" , i )
}
2017-02-16 18:16:06 +00:00
2018-09-18 03:03:00 +00:00
select {
case <- quit :
return
2017-02-16 18:16:06 +00:00
2018-09-18 03:03:00 +00:00
case <- m . quitCh :
return
2017-09-05 15:09:00 +00:00
2018-09-18 03:03:00 +00:00
default :
broker <- & lease {
namespace : ns ,
id : leaseID ,
}
}
2017-02-16 18:16:06 +00:00
}
2017-02-06 23:30:13 +00:00
}
2017-02-16 18:16:06 +00:00
// Close the broker, causing worker routines to exit
close ( broker )
} ( )
2017-09-05 15:09:00 +00:00
// Ensure all keys on the chan are processed
2018-09-18 03:03:00 +00:00
for i := 0 ; i < leaseCount ; i ++ {
2017-02-16 18:16:06 +00:00
select {
case err := <- errs :
// Close all go routines
close ( quit )
2015-03-16 18:33:59 +00:00
return err
2017-09-05 15:09:00 +00:00
case <- m . quitCh :
close ( quit )
return nil
2017-02-16 18:16:06 +00:00
2017-09-05 15:09:00 +00:00
case <- result :
2017-02-16 18:16:06 +00:00
}
2015-03-16 18:33:59 +00:00
}
2017-02-16 18:16:06 +00:00
// Let all go routines finish
wg . Wait ( )
2017-09-05 15:09:00 +00:00
m . restoreModeLock . Lock ( )
2018-06-09 19:35:22 +00:00
atomic . StoreInt32 ( m . restoreMode , 0 )
2018-10-19 19:21:42 +00:00
m . restoreLoaded . Range ( func ( k , v interface { } ) bool {
m . restoreLoaded . Delete ( k )
return true
} )
m . restoreLocks = nil
2017-09-05 15:09:00 +00:00
m . restoreModeLock . Unlock ( )
2018-04-03 00:46:59 +00:00
m . logger . Info ( "lease restore complete" )
2017-09-05 15:09:00 +00:00
return nil
}
// processRestore takes a lease and restores it in the expiration manager if it has
// not already been seen
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) processRestore ( ctx context . Context , leaseID string ) error {
2017-09-05 15:09:00 +00:00
m . restoreRequestLock . RLock ( )
defer m . restoreRequestLock . RUnlock ( )
// Check if the lease has been seen
if _ , ok := m . restoreLoaded . Load ( leaseID ) ; ok {
return nil
}
m . lockLease ( leaseID )
defer m . unlockLease ( leaseID )
// Check again with the lease locked
if _ , ok := m . restoreLoaded . Load ( leaseID ) ; ok {
return nil
2015-03-24 00:27:46 +00:00
}
2017-02-16 18:16:06 +00:00
2017-09-05 15:09:00 +00:00
// Load lease and restore expiration timer
2018-09-18 03:03:00 +00:00
_ , err := m . loadEntryInternal ( ctx , leaseID , true , false )
2017-09-05 15:09:00 +00:00
if err != nil {
return err
}
2015-03-13 18:20:36 +00:00
return nil
}
// Stop is used to prevent further automatic revocations.
// This must be called before sealing the view.
func ( m * ExpirationManager ) Stop ( ) error {
2015-03-16 01:06:19 +00:00
// Stop all the pending expiration timers
2018-04-03 00:46:59 +00:00
m . logger . Debug ( "stop triggered" )
defer m . logger . Debug ( "finished stopping" )
2017-09-05 15:09:00 +00:00
Vault-1403 Switch Expiration Manager to use Fairsharing Backpressure (#1709) (#10932)
* basic pool and start testing
* refactor a bit for testing
* workFunc, start/stop safety, testing
* cleanup function for worker quit, more tests
* redo public/private members
* improve tests, export types, switch uuid package
* fix loop capture bug, cleanup
* cleanup tests
* update worker pool file name, other improvements
* add job manager prototype
* remove remnants
* add functions to wait for job manager and worker pool to stop, other fixes
* test job manager functionality, fix bugs
* encapsulate how jobs are distributed to workers
* make worker job channel read only
* add job interface, more testing, fixes
* set name for dispatcher
* fix test races
* wire up expiration manager most of the way
* dispatcher and job manager constructors don't return errors
* logger now dependency injected
* make some members private, test fcn to get worker pool size
* make GetNumWorkers public
* Update helper/fairshare/jobmanager_test.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* update fairsharing usage, add tests
* make workerpool private
* remove custom worker names
* concurrency improvements
* remove worker pool cleanup function
* remove cleanup func from job manager, remove non blocking stop from fairshare
* update job manager for new constructor
* stop job manager when expiration manager stopped
* unset env var after test
* stop fairshare when started in tests
* stop leaking job manager goroutine
* prototype channel for waking up to assign work
* fix typo/bug and add tests
* improve job manager wake up, fix test typo
* put channel drain back
* better start/pause test for job manager
* comment cleanup
* degrade possible noisy log
* remove closure, clean up context
* improve revocation context timer
* test: reduce number of revocation workers during many tests
* Update vault/expiration.go
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
* feedback tweaks
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
Co-authored-by: Brian Kassouf <briankassouf@users.noreply.github.com>
2021-02-17 22:30:27 +00:00
m . jobManager . Stop ( )
2017-12-01 22:08:38 +00:00
// Do this before stopping pending timers to avoid potential races with
// expiring timers
close ( m . quitCh )
2015-03-16 01:06:19 +00:00
m . pendingLock . Lock ( )
2020-06-15 23:54:36 +00:00
// Replacing the entire map would cause a race with
// a simultaneous WalkTokens, which doesn't hold pendingLock.
2020-05-21 17:41:03 +00:00
m . pending . Range ( func ( key , value interface { } ) bool {
info := value . ( pendingInfo )
info . timer . Stop ( )
m . pending . Delete ( key )
return true
} )
2020-10-08 00:27:45 +00:00
m . leaseCount = 0
2020-06-15 23:54:36 +00:00
m . nonexpiring . Range ( func ( key , value interface { } ) bool {
m . nonexpiring . Delete ( key )
return true
} )
m . uniquePolicies = make ( map [ string ] [ ] string )
2021-04-29 15:12:02 +00:00
m . zombies . Range ( func ( key , _ interface { } ) bool {
m . zombies . Delete ( key )
return true
} )
2015-03-16 01:06:19 +00:00
m . pendingLock . Unlock ( )
2017-09-05 15:09:00 +00:00
if m . inRestoreMode ( ) {
for {
if ! m . inRestoreMode ( ) {
break
}
time . Sleep ( 10 * time . Millisecond )
}
}
2020-06-15 23:54:36 +00:00
m . emptyUniquePolicies . Stop ( )
2015-03-12 19:44:22 +00:00
return nil
}
2015-03-13 01:38:15 +00:00
2015-04-08 20:35:32 +00:00
// Revoke is used to revoke a secret named by the given LeaseID
2018-07-24 21:50:49 +00:00
func ( m * ExpirationManager ) Revoke ( ctx context . Context , leaseID string ) error {
2015-04-08 23:43:17 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "revoke" } , time . Now ( ) )
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
2018-07-24 21:50:49 +00:00
return m . revokeCommon ( ctx , leaseID , false , false )
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
}
2018-07-11 19:45:09 +00:00
// LazyRevoke is used to queue revocation for a secret named by the given
// LeaseID. If the lease was not found it returns nil; if the lease was found
// it triggers a return of a 202.
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) LazyRevoke ( ctx context . Context , leaseID string ) error {
2018-07-11 19:45:09 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "lazy-revoke" } , time . Now ( ) )
// Load the entry
2018-08-02 01:39:39 +00:00
le , err := m . loadEntry ( ctx , leaseID )
2018-07-11 19:45:09 +00:00
if err != nil {
return err
}
// If there is no entry, nothing to revoke
if le == nil {
return nil
}
le . ExpireTime = time . Now ( )
{
m . pendingLock . Lock ( )
2018-08-02 01:39:39 +00:00
if err := m . persistEntry ( ctx , le ) ; err != nil {
2018-07-11 19:45:09 +00:00
m . pendingLock . Unlock ( )
return err
}
2020-09-23 18:46:22 +00:00
m . updatePendingInternal ( le )
2018-07-11 19:45:09 +00:00
m . pendingLock . Unlock ( )
}
return nil
}
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
// revokeCommon does the heavy lifting. If force is true, we ignore a problem
// during revocation and still remove entries/index/lease timers
2018-07-24 21:50:49 +00:00
func ( m * ExpirationManager ) revokeCommon ( ctx context . Context , leaseID string , force , skipToken bool ) error {
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "revoke-common" } , time . Now ( ) )
2017-09-05 15:09:00 +00:00
2015-03-16 18:33:59 +00:00
// Load the entry
2018-08-02 01:39:39 +00:00
le , err := m . loadEntry ( ctx , leaseID )
2015-03-16 18:33:59 +00:00
if err != nil {
return err
}
// If there is no entry, nothing to revoke
if le == nil {
return nil
}
// Revoke the entry
2016-03-31 19:10:25 +00:00
if ! skipToken || le . Auth == nil {
2018-08-02 01:39:39 +00:00
if err := m . revokeEntry ( ctx , le ) ; err != nil {
2016-03-31 19:10:25 +00:00
if ! force {
return err
2017-09-05 15:09:00 +00:00
}
if m . logger . IsWarn ( ) {
m . logger . Warn ( "revocation from the backend failed, but in force mode so ignoring" , "error" , err )
2016-03-31 19:10:25 +00:00
}
2016-03-08 16:05:46 +00:00
}
2015-03-16 18:33:59 +00:00
}
// Delete the entry
2018-09-18 03:03:00 +00:00
if err := m . deleteEntry ( ctx , le ) ; err != nil {
2015-03-16 18:33:59 +00:00
return err
}
2016-12-16 18:11:55 +00:00
// Delete the secondary index, but only if it's a leased secret (not auth)
if le . Secret != nil {
2021-04-16 21:03:22 +00:00
var indexToken string
// Maintain secondary index by token, except for orphan batch tokens
switch le . ClientTokenType {
case logical . TokenTypeBatch :
te , err := m . tokenStore . lookupBatchTokenInternal ( ctx , le . ClientToken )
if err != nil {
return err
}
// If it's a non-orphan batch token, assign the secondary index to its
// parent
indexToken = te . Parent
default :
indexToken = le . ClientToken
}
if indexToken != "" {
if err := m . removeIndexByToken ( ctx , le , indexToken ) ; err != nil {
return err
}
2016-12-16 18:11:55 +00:00
}
2015-04-10 21:48:08 +00:00
}
2020-06-26 21:13:16 +00:00
// Clear the expiration handler
2015-03-16 18:33:59 +00:00
m . pendingLock . Lock ( )
2021-04-29 15:12:02 +00:00
m . removeFromPending ( ctx , leaseID )
2020-06-15 23:54:36 +00:00
m . nonexpiring . Delete ( leaseID )
2021-04-29 15:12:02 +00:00
m . zombies . Delete ( leaseID )
2015-03-16 18:33:59 +00:00
m . pendingLock . Unlock ( )
2018-03-20 15:46:27 +00:00
2018-04-04 07:07:10 +00:00
if m . logger . IsInfo ( ) && ! skipToken && m . logLeaseExpirations {
2018-04-03 00:46:59 +00:00
m . logger . Info ( "revoked lease" , "lease_id" , leaseID )
2018-03-20 15:46:27 +00:00
}
2015-03-13 01:38:15 +00:00
return nil
}
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
// RevokeForce works similarly to RevokePrefix but continues in the case of a
// revocation error; this is mostly meant for recovery operations
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) RevokeForce ( ctx context . Context , prefix string ) error {
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "revoke-force" } , time . Now ( ) )
2018-08-02 01:39:39 +00:00
return m . revokePrefixCommon ( ctx , prefix , true , true )
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
}
2015-03-13 01:38:15 +00:00
// RevokePrefix is used to revoke all secrets with a given prefix.
// The prefix maps to that of the mount table to make this simpler
// to reason about.
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) RevokePrefix ( ctx context . Context , prefix string , sync bool ) error {
2015-04-08 23:43:17 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "revoke-prefix" } , time . Now ( ) )
2015-03-16 21:59:37 +00:00
2018-08-02 01:39:39 +00:00
return m . revokePrefixCommon ( ctx , prefix , false , sync )
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
}
2016-03-31 19:10:25 +00:00
// RevokeByToken is used to revoke all the secrets issued with a given token.
// This is done by using the secondary index. It also removes the lease entry
// for the token itself. As a result it should *ONLY* ever be called from the
// token store's revokeSalted function.
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) RevokeByToken ( ctx context . Context , te * logical . TokenEntry ) error {
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "revoke-by-token" } , time . Now ( ) )
2018-09-18 03:03:00 +00:00
tokenNS , err := NamespaceByID ( ctx , te . NamespaceID , m . core )
if err != nil {
return err
}
if tokenNS == nil {
return namespace . ErrNoNamespace
}
2017-09-05 15:09:00 +00:00
2018-09-18 03:03:00 +00:00
tokenCtx := namespace . ContextWithNamespace ( ctx , tokenNS )
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
// Lookup the leases
2018-09-18 03:03:00 +00:00
existing , err := m . lookupLeasesByToken ( tokenCtx , te )
2015-03-18 19:03:33 +00:00
if err != nil {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( "failed to scan for leases: {{err}}" , err )
2015-03-16 18:33:59 +00:00
}
// Revoke all the keys
2018-05-10 19:50:02 +00:00
for _ , leaseID := range existing {
// Load the entry
2018-08-02 01:39:39 +00:00
le , err := m . loadEntry ( ctx , leaseID )
2018-05-10 19:50:02 +00:00
if err != nil {
return err
}
// If there's a lease, set expiration to now, persist, and call
// updatePending to hand off revocation to the expiration manager's pending
// timer map
if le != nil {
le . ExpireTime = time . Now ( )
2018-06-08 21:24:44 +00:00
{
m . pendingLock . Lock ( )
2018-08-02 01:39:39 +00:00
if err := m . persistEntry ( ctx , le ) ; err != nil {
2018-06-08 21:24:44 +00:00
m . pendingLock . Unlock ( )
return err
}
2018-05-10 19:50:02 +00:00
2020-09-23 18:46:22 +00:00
m . updatePendingInternal ( le )
2018-06-08 21:24:44 +00:00
m . pendingLock . Unlock ( )
}
2015-03-16 18:33:59 +00:00
}
}
2016-03-31 19:10:25 +00:00
2018-05-10 19:50:02 +00:00
// te.Path should never be empty, but we check just in case
2016-12-16 20:29:27 +00:00
if te . Path != "" {
2018-09-18 03:03:00 +00:00
saltCtx := namespace . ContextWithNamespace ( ctx , tokenNS )
saltedID , err := m . tokenStore . SaltID ( saltCtx , te . ID )
2017-07-18 16:02:03 +00:00
if err != nil {
return err
}
tokenLeaseID := path . Join ( te . Path , saltedID )
2016-12-16 20:29:27 +00:00
2018-09-18 03:03:00 +00:00
if tokenNS . ID != namespace . RootNamespaceID {
tokenLeaseID = fmt . Sprintf ( "%s.%s" , tokenLeaseID , tokenNS . ID )
}
2016-12-16 20:29:27 +00:00
// We want to skip the revokeEntry call as that will call back into
// revocation logic in the token store, which is what is running this
// function in the first place -- it'd be a deadlock loop. Since the only
// place that this function is called is revokeSalted in the token store,
// we're already revoking the token, so we just want to clean up the lease.
// This avoids spurious revocations later in the log when the timer runs
// out, and eases up resource usage.
2018-08-02 01:39:39 +00:00
return m . revokeCommon ( ctx , tokenLeaseID , false , true )
2016-12-16 20:29:27 +00:00
}
2016-03-31 19:10:25 +00:00
2016-12-16 20:29:27 +00:00
return nil
2015-03-13 01:38:15 +00:00
}
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) revokePrefixCommon ( ctx context . Context , prefix string , force , sync bool ) error {
2017-09-05 15:09:00 +00:00
if m . inRestoreMode ( ) {
m . restoreRequestLock . Lock ( )
defer m . restoreRequestLock . Unlock ( )
}
2018-04-26 20:26:07 +00:00
// Ensure there is a trailing slash; or, if there is no slash, see if there
// is a matching specific ID
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
if ! strings . HasSuffix ( prefix , "/" ) {
2018-08-02 01:39:39 +00:00
le , err := m . loadEntry ( ctx , prefix )
2018-04-26 20:26:07 +00:00
if err == nil && le != nil {
2018-07-11 19:45:09 +00:00
if sync {
2018-08-02 01:39:39 +00:00
if err := m . revokeCommon ( ctx , prefix , force , false ) ; err != nil {
2018-07-11 19:45:09 +00:00
return errwrap . Wrapf ( fmt . Sprintf ( "failed to revoke %q: {{err}}" , prefix ) , err )
}
return nil
2018-04-26 20:26:07 +00:00
}
2018-08-02 01:39:39 +00:00
return m . LazyRevoke ( ctx , prefix )
2018-04-26 20:26:07 +00:00
}
2018-06-03 22:14:51 +00:00
prefix = prefix + "/"
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
}
// Accumulate existing leases
2018-09-18 03:03:00 +00:00
ns , err := namespace . FromContext ( ctx )
if err != nil {
return err
}
view := m . leaseView ( ns )
sub := view . SubView ( prefix )
2018-08-02 01:39:39 +00:00
existing , err := logical . CollectKeys ( ctx , sub )
2015-04-10 21:48:08 +00:00
if err != nil {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( "failed to scan for leases: {{err}}" , err )
2015-04-10 21:48:08 +00:00
}
// Revoke all the keys
Add forced revocation.
In some situations, it can be impossible to revoke leases (for instance,
if someone has gone and manually removed users created by Vault). This
can not only cause Vault to cycle trying to revoke them, but it also
prevents mounts from being unmounted, leaving them in a tainted state
where the only operations allowed are to revoke (or rollback), which
will never successfully complete.
This adds a new endpoint that works similarly to `revoke-prefix` but
ignores errors coming from a backend upon revocation (it does not ignore
errors coming from within the expiration manager, such as errors
accessing the data store). This can be used to force Vault to abandon
leases.
Like `revoke-prefix`, this is a very sensitive operation and requires
`sudo`. It is implemented as a separate endpoint, rather than an
argument to `revoke-prefix`, to ensure that control can be delegated
appropriately, as even most administrators should not normally have
this privilege.
Fixes #1135
2016-03-03 01:26:38 +00:00
for idx , suffix := range existing {
leaseID := prefix + suffix
2018-07-11 19:45:09 +00:00
switch {
case sync :
2018-08-02 01:39:39 +00:00
if err := m . revokeCommon ( ctx , leaseID , force , false ) ; err != nil {
2018-07-11 19:45:09 +00:00
return errwrap . Wrapf ( fmt . Sprintf ( "failed to revoke %q (%d / %d): {{err}}" , leaseID , idx + 1 , len ( existing ) ) , err )
}
default :
2018-08-02 01:39:39 +00:00
if err := m . LazyRevoke ( ctx , leaseID ) ; err != nil {
2018-07-11 19:45:09 +00:00
return errwrap . Wrapf ( fmt . Sprintf ( "failed to revoke %q (%d / %d): {{err}}" , leaseID , idx + 1 , len ( existing ) ) , err )
}
2015-04-10 21:48:08 +00:00
}
}
2018-07-11 19:45:09 +00:00
2015-04-10 21:48:08 +00:00
return nil
}
2015-04-08 20:35:32 +00:00
// Renew is used to renew a secret using the given leaseID
2015-03-13 01:38:15 +00:00
// and a renew interval. The increment may be ignored.
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) Renew ( ctx context . Context , leaseID string , increment time . Duration ) ( * logical . Response , error ) {
2015-04-08 23:43:17 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "renew" } , time . Now ( ) )
2017-09-05 15:09:00 +00:00
2015-03-16 18:33:59 +00:00
// Load the entry
2018-08-02 01:39:39 +00:00
le , err := m . loadEntry ( ctx , leaseID )
2015-03-16 18:33:59 +00:00
if err != nil {
return nil , err
}
2015-04-09 21:23:37 +00:00
// Check if the lease is renewable
2017-05-04 02:03:42 +00:00
if _ , err := le . renewable ( ) ; err != nil {
2015-04-09 21:23:37 +00:00
return nil , err
2015-04-09 00:03:46 +00:00
}
2017-06-21 15:00:39 +00:00
if le . Secret == nil {
if le . Auth != nil {
2019-10-02 14:55:20 +00:00
return logical . ErrorResponse ( "tokens cannot be renewed through this endpoint" ) , nil
2017-06-21 15:00:39 +00:00
}
return logical . ErrorResponse ( "lease does not correspond to a secret" ) , nil
}
2019-03-01 00:02:25 +00:00
ns , err := namespace . FromContext ( ctx )
2018-09-18 03:03:00 +00:00
if err != nil {
return nil , err
}
2019-03-01 00:02:25 +00:00
if ns . ID != le . namespace . ID {
2018-09-18 03:03:00 +00:00
return nil , errors . New ( "cannot renew a lease across namespaces" )
}
sysViewCtx := namespace . ContextWithNamespace ( ctx , le . namespace )
sysView := m . router . MatchingSystemView ( sysViewCtx , le . Path )
2018-04-03 16:20:20 +00:00
if sysView == nil {
2018-04-05 15:49:21 +00:00
return nil , fmt . Errorf ( "unable to retrieve system view from router" )
2018-04-03 16:20:20 +00:00
}
2015-03-16 20:29:51 +00:00
// Attempt to renew the entry
2018-08-02 01:39:39 +00:00
resp , err := m . renewEntry ( ctx , le , increment )
2015-03-16 20:29:51 +00:00
if err != nil {
return nil , err
2015-03-16 18:33:59 +00:00
}
2018-04-03 16:20:20 +00:00
if resp == nil {
return nil , nil
}
if resp . IsError ( ) {
return & logical . Response {
Data : resp . Data ,
} , nil
}
if resp . Secret == nil {
return nil , nil
2015-03-16 18:33:59 +00:00
}
2018-04-03 16:20:20 +00:00
ttl , warnings , err := framework . CalculateTTL ( sysView , increment , resp . Secret . TTL , 0 , resp . Secret . MaxTTL , 0 , le . IssueTime )
if err != nil {
2015-03-16 20:29:51 +00:00
return nil , err
2015-03-16 18:33:59 +00:00
}
2018-04-03 16:20:20 +00:00
for _ , warning := range warnings {
resp . AddWarning ( warning )
}
resp . Secret . TTL = ttl
2015-03-16 18:33:59 +00:00
2015-04-08 20:35:32 +00:00
// Attach the LeaseID
resp . Secret . LeaseID = leaseID
2015-03-16 23:11:55 +00:00
2015-03-16 18:33:59 +00:00
// Update the lease entry
2015-03-16 20:29:51 +00:00
le . Data = resp . Data
2015-03-19 22:11:42 +00:00
le . Secret = resp . Secret
2015-04-09 19:29:13 +00:00
le . ExpireTime = resp . Secret . ExpirationTime ( )
2016-07-07 21:44:14 +00:00
le . LastRenewalTime = time . Now ( )
2015-03-16 18:33:59 +00:00
2018-10-15 16:56:24 +00:00
// If the token it's associated with is a batch token, constrain lease
// times
if le . ClientTokenType == logical . TokenTypeBatch {
te , err := m . tokenStore . Lookup ( ctx , le . ClientToken )
if err != nil {
return nil , err
}
if te == nil {
return nil , errors . New ( "cannot renew lease, no valid associated token" )
}
tokenLeaseTimes , err := m . FetchLeaseTimesByToken ( ctx , te )
if err != nil {
return nil , err
}
2020-09-22 21:47:13 +00:00
if tokenLeaseTimes == nil {
return nil , errors . New ( "failed to load batch token expiration time" )
}
2018-10-15 16:56:24 +00:00
if le . ExpireTime . After ( tokenLeaseTimes . ExpireTime ) {
resp . Secret . TTL = tokenLeaseTimes . ExpireTime . Sub ( le . LastRenewalTime )
le . ExpireTime = tokenLeaseTimes . ExpireTime
}
}
2018-06-08 21:24:44 +00:00
{
m . pendingLock . Lock ( )
2018-08-02 01:39:39 +00:00
if err := m . persistEntry ( ctx , le ) ; err != nil {
2018-06-08 21:24:44 +00:00
m . pendingLock . Unlock ( )
return nil , err
}
// Update the expiration time
2020-09-23 18:46:22 +00:00
m . updatePendingInternal ( le )
2018-06-08 21:24:44 +00:00
m . pendingLock . Unlock ( )
}
2015-03-16 18:33:59 +00:00
2015-03-16 20:29:51 +00:00
// Return the response
return resp , nil
2015-03-13 01:38:15 +00:00
}
2015-04-03 18:58:10 +00:00
// RenewToken is used to renew a token which does not need to
// invoke a logical backend.
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) RenewToken ( ctx context . Context , req * logical . Request , te * logical . TokenEntry ,
2016-03-04 19:56:51 +00:00
increment time . Duration ) ( * logical . Response , error ) {
2015-04-08 23:43:17 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "renew-token" } , time . Now ( ) )
2017-09-05 15:09:00 +00:00
2018-09-18 03:03:00 +00:00
tokenNS , err := NamespaceByID ( ctx , te . NamespaceID , m . core )
if err != nil {
return nil , err
}
if tokenNS == nil {
return nil , namespace . ErrNoNamespace
}
ns , err := namespace . FromContext ( ctx )
if err != nil {
return nil , err
}
if ns . ID != tokenNS . ID {
return nil , errors . New ( "cannot renew a token across namespaces" )
}
2015-04-08 20:35:32 +00:00
// Compute the Lease ID
2018-09-18 03:03:00 +00:00
saltedID , err := m . tokenStore . SaltID ( ctx , te . ID )
2017-07-18 16:02:03 +00:00
if err != nil {
return nil , err
}
2018-09-18 03:03:00 +00:00
leaseID := path . Join ( te . Path , saltedID )
if ns . ID != namespace . RootNamespaceID {
leaseID = fmt . Sprintf ( "%s.%s" , leaseID , ns . ID )
}
2015-04-03 18:58:10 +00:00
// Load the entry
2018-08-02 01:39:39 +00:00
le , err := m . loadEntry ( ctx , leaseID )
2015-04-03 18:58:10 +00:00
if err != nil {
2015-04-06 23:35:39 +00:00
return nil , err
2015-04-03 18:58:10 +00:00
}
2018-09-18 03:03:00 +00:00
if le == nil {
return logical . ErrorResponse ( "invalid lease ID" ) , logical . ErrInvalidRequest
}
2015-04-03 18:58:10 +00:00
2016-03-01 17:33:35 +00:00
// Check if the lease is renewable. Note that this also checks for a nil
// lease and errors in that case as well.
2017-05-04 02:03:42 +00:00
if _ , err := le . renewable ( ) ; err != nil {
2016-07-06 20:42:34 +00:00
return logical . ErrorResponse ( err . Error ( ) ) , logical . ErrInvalidRequest
2015-04-03 18:58:10 +00:00
}
2015-04-09 21:23:37 +00:00
// Attempt to renew the auth entry
2018-08-02 01:39:39 +00:00
resp , err := m . renewAuthEntry ( ctx , req , le , increment )
2015-04-09 21:23:37 +00:00
if err != nil {
return nil , err
2015-04-03 18:58:10 +00:00
}
2015-04-09 21:23:37 +00:00
if resp == nil {
return nil , nil
}
2016-03-04 20:13:04 +00:00
if resp . IsError ( ) {
2016-03-04 20:35:58 +00:00
return & logical . Response {
Data : resp . Data ,
} , nil
2016-03-04 20:13:04 +00:00
}
2018-04-03 16:20:20 +00:00
if resp . Auth == nil {
return nil , nil
2015-04-09 00:03:46 +00:00
}
2018-09-18 03:03:00 +00:00
sysViewCtx := namespace . ContextWithNamespace ( ctx , le . namespace )
sysView := m . router . MatchingSystemView ( sysViewCtx , le . Path )
2017-12-15 18:30:05 +00:00
if sysView == nil {
2018-04-05 15:49:21 +00:00
return nil , fmt . Errorf ( "unable to retrieve system view from router" )
2017-12-15 18:30:05 +00:00
}
2018-04-03 16:20:20 +00:00
ttl , warnings , err := framework . CalculateTTL ( sysView , increment , resp . Auth . TTL , resp . Auth . Period , resp . Auth . MaxTTL , resp . Auth . ExplicitMaxTTL , le . IssueTime )
if err != nil {
return nil , err
}
2017-12-15 18:30:05 +00:00
retResp := & logical . Response { }
2018-04-03 16:20:20 +00:00
for _ , warning := range warnings {
retResp . AddWarning ( warning )
2017-12-15 18:30:05 +00:00
}
2018-04-03 16:20:20 +00:00
resp . Auth . TTL = ttl
2017-12-15 18:30:05 +00:00
2015-04-09 21:23:37 +00:00
// Attach the ClientToken
2018-09-18 03:03:00 +00:00
resp . Auth . ClientToken = te . ID
2015-04-09 21:23:37 +00:00
2018-08-23 01:53:04 +00:00
// Refresh groups
2020-03-23 22:00:26 +00:00
if resp . Auth . EntityID != "" && m . core . identityStore != nil {
2021-05-03 12:23:59 +00:00
mountAccessor := ""
if resp . Auth . Alias != nil {
mountAccessor = resp . Auth . Alias . MountAccessor
}
validAliases , err := m . core . identityStore . refreshExternalGroupMembershipsByEntityID ( ctx , resp . Auth . EntityID , resp . Auth . GroupAliases , mountAccessor )
2018-08-23 01:53:04 +00:00
if err != nil {
return nil , err
}
resp . Auth . GroupAliases = validAliases
}
2015-04-03 18:58:10 +00:00
// Update the lease entry
2015-04-09 21:23:37 +00:00
le . Auth = resp . Auth
le . ExpireTime = resp . Auth . ExpirationTime ( )
2016-07-07 21:44:14 +00:00
le . LastRenewalTime = time . Now ( )
2015-04-03 18:58:10 +00:00
2018-06-08 21:24:44 +00:00
{
m . pendingLock . Lock ( )
2018-08-02 01:39:39 +00:00
if err := m . persistEntry ( ctx , le ) ; err != nil {
2018-06-08 21:24:44 +00:00
m . pendingLock . Unlock ( )
return nil , err
}
// Update the expiration time
2020-09-23 18:46:22 +00:00
m . updatePendingInternal ( le )
2018-06-08 21:24:44 +00:00
m . pendingLock . Unlock ( )
}
2017-12-15 18:30:05 +00:00
retResp . Auth = resp . Auth
return retResp , nil
2015-04-03 18:58:10 +00:00
}
2015-03-13 01:38:15 +00:00
// Register is used to take a request and response with an associated
2015-04-08 20:35:32 +00:00
// lease. The secret gets assigned a LeaseID and the management of
2015-03-13 01:38:15 +00:00
// of lease is assumed by the expiration manager.
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) Register ( ctx context . Context , req * logical . Request , resp * logical . Response ) ( id string , retErr error ) {
2015-04-08 23:43:17 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "register" } , time . Now ( ) )
2017-05-04 16:45:57 +00:00
2018-10-15 16:56:24 +00:00
te := req . TokenEntry ( )
if te == nil {
2018-04-05 15:49:21 +00:00
return "" , fmt . Errorf ( "cannot register a lease with an empty client token" )
2017-05-04 16:45:57 +00:00
}
2015-03-19 22:11:42 +00:00
// Ignore if there is no leased secret
2015-04-01 04:04:10 +00:00
if resp == nil || resp . Secret == nil {
2015-03-13 17:55:54 +00:00
return "" , nil
}
2015-03-19 22:11:42 +00:00
// Validate the secret
if err := resp . Secret . Validate ( ) ; err != nil {
2015-03-13 17:55:54 +00:00
return "" , err
}
2017-05-03 18:29:57 +00:00
// Create a lease entry
2018-12-20 15:40:01 +00:00
leaseRand , err := base62 . Random ( TokenLength )
2018-09-18 03:03:00 +00:00
if err != nil {
return "" , err
}
ns , err := namespace . FromContext ( ctx )
2017-05-03 18:29:57 +00:00
if err != nil {
return "" , err
}
2018-09-18 03:03:00 +00:00
leaseID := path . Join ( req . Path , leaseRand )
if ns . ID != namespace . RootNamespaceID {
leaseID = fmt . Sprintf ( "%s.%s" , leaseID , ns . ID )
}
le := & leaseEntry {
2018-10-15 16:56:24 +00:00
LeaseID : leaseID ,
ClientToken : req . ClientToken ,
ClientTokenType : te . Type ,
Path : req . Path ,
Data : resp . Data ,
Secret : resp . Secret ,
IssueTime : time . Now ( ) ,
ExpireTime : resp . Secret . ExpirationTime ( ) ,
namespace : ns ,
2019-11-07 22:10:47 +00:00
Version : 1 ,
2018-09-18 03:03:00 +00:00
}
2017-05-03 18:29:57 +00:00
2021-04-16 21:03:22 +00:00
var indexToken string
// Maintain secondary index by token, except for orphan batch tokens
switch {
case te . Type != logical . TokenTypeBatch :
indexToken = le . ClientToken
case te . Parent != "" :
// If it's a non-orphan batch token, assign the secondary index to its
// parent
indexToken = te . Parent
}
2017-05-03 16:17:09 +00:00
defer func ( ) {
2017-05-04 15:54:57 +00:00
// If there is an error we want to rollback as much as possible (note
// that errors here are ignored to do as much cleanup as we can). We
// want to revoke a generated secret (since an error means we may not
// be successfully tracking it), remove indexes, and delete the entry.
2017-05-03 16:17:09 +00:00
if retErr != nil {
2018-09-18 03:03:00 +00:00
revokeCtx := namespace . ContextWithNamespace ( m . quitContext , ns )
revResp , err := m . router . Route ( revokeCtx , logical . RevokeRequest ( req . Path , resp . Secret , resp . Data ) )
2017-05-03 16:17:09 +00:00
if err != nil {
2017-05-04 16:44:31 +00:00
retErr = multierror . Append ( retErr , errwrap . Wrapf ( "an additional internal error was encountered revoking the newly-generated secret: {{err}}" , err ) )
2017-05-03 16:36:10 +00:00
} else if revResp != nil && revResp . IsError ( ) {
retErr = multierror . Append ( retErr , errwrap . Wrapf ( "an additional error was encountered revoking the newly-generated secret: {{err}}" , revResp . Error ( ) ) )
2017-05-03 16:17:09 +00:00
}
2017-05-03 18:29:57 +00:00
2018-09-18 03:03:00 +00:00
if err := m . deleteEntry ( ctx , le ) ; err != nil {
2017-05-04 16:44:31 +00:00
retErr = multierror . Append ( retErr , errwrap . Wrapf ( "an additional error was encountered deleting any lease associated with the newly-generated secret: {{err}}" , err ) )
2017-05-03 18:29:57 +00:00
}
2021-04-16 21:03:22 +00:00
if err := m . removeIndexByToken ( ctx , le , indexToken ) ; err != nil {
2017-05-04 16:44:31 +00:00
retErr = multierror . Append ( retErr , errwrap . Wrapf ( "an additional error was encountered removing lease indexes associated with the newly-generated secret: {{err}}" , err ) )
2017-05-03 18:29:57 +00:00
}
2017-05-03 16:17:09 +00:00
}
} ( )
2018-10-15 16:56:24 +00:00
// If the token is a batch token, we want to constrain the maximum lifetime
// by the token's lifetime
if te . Type == logical . TokenTypeBatch {
tokenLeaseTimes , err := m . FetchLeaseTimesByToken ( ctx , te )
if err != nil {
return "" , err
}
2020-09-22 21:47:13 +00:00
if tokenLeaseTimes == nil {
return "" , errors . New ( "failed to load batch token expiration time" )
}
2018-10-15 16:56:24 +00:00
if le . ExpireTime . After ( tokenLeaseTimes . ExpireTime ) {
le . ExpireTime = tokenLeaseTimes . ExpireTime
}
}
2015-03-16 01:06:19 +00:00
// Encode the entry
2018-09-18 03:03:00 +00:00
if err := m . persistEntry ( ctx , le ) ; err != nil {
2015-03-16 01:06:19 +00:00
return "" , err
}
2021-04-16 21:03:22 +00:00
if indexToken != "" {
if err := m . createIndexByToken ( ctx , le , indexToken ) ; err != nil {
2018-10-15 16:56:24 +00:00
return "" , err
}
2015-04-10 21:48:08 +00:00
}
2015-04-01 04:01:12 +00:00
// Setup revocation timer if there is a lease
2020-09-23 18:46:22 +00:00
m . updatePending ( le )
2020-09-22 21:47:13 +00:00
// We round here because the clock will have already started
// ticking, so we'll end up always returning 299 instead of 300 or
// 26399 instead of 26400, say, even if it's just a few
// microseconds. This provides a nicer UX.
resp . Secret . TTL = le . ExpireTime . Sub ( time . Now ( ) ) . Round ( time . Second )
2015-03-16 01:06:19 +00:00
// Done
2015-04-08 20:35:32 +00:00
return le . LeaseID , nil
2015-03-16 01:06:19 +00:00
}
2015-04-03 00:45:42 +00:00
// RegisterAuth is used to take an Auth response with an associated lease.
2015-04-08 20:35:32 +00:00
// The token does not get a LeaseID, but the lease management is handled by
2015-04-03 00:45:42 +00:00
// the expiration manager.
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) RegisterAuth ( ctx context . Context , te * logical . TokenEntry , auth * logical . Auth ) error {
2015-04-08 23:43:17 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "register-auth" } , time . Now ( ) )
2015-04-11 04:21:06 +00:00
2019-11-08 21:14:03 +00:00
// Triggers failure of RegisterAuth. This should only be set and triggered
// by tests to simulate partial failure during a token creation request.
if m . testRegisterAuthFailure . Load ( ) {
return fmt . Errorf ( "failing explicitly on RegisterAuth" )
}
2019-11-05 21:11:13 +00:00
authExpirationTime := auth . ExpirationTime ( )
if te . TTL == 0 && authExpirationTime . IsZero ( ) && ( len ( te . Policies ) != 1 || te . Policies [ 0 ] != "root" ) {
return errors . New ( "refusing to register a lease for a non-root token with no TTL" )
}
2018-10-15 16:56:24 +00:00
if te . Type == logical . TokenTypeBatch {
return errors . New ( "cannot register a lease for a batch token" )
}
2017-05-03 16:17:09 +00:00
if auth . ClientToken == "" {
2018-10-15 16:56:24 +00:00
return errors . New ( "cannot register an auth lease with an empty token" )
2017-05-03 16:17:09 +00:00
}
2018-09-18 03:03:00 +00:00
if strings . Contains ( te . Path , ".." ) {
2018-04-05 15:49:21 +00:00
return consts . ErrPathContainsParentReferences
2017-05-12 17:52:33 +00:00
}
2018-09-18 03:03:00 +00:00
tokenNS , err := NamespaceByID ( ctx , te . NamespaceID , m . core )
if err != nil {
return err
}
if tokenNS == nil {
return namespace . ErrNoNamespace
}
saltCtx := namespace . ContextWithNamespace ( ctx , tokenNS )
saltedID , err := m . tokenStore . SaltID ( saltCtx , auth . ClientToken )
2017-07-18 16:02:03 +00:00
if err != nil {
return err
}
2018-09-18 03:03:00 +00:00
leaseID := path . Join ( te . Path , saltedID )
if tokenNS . ID != namespace . RootNamespaceID {
leaseID = fmt . Sprintf ( "%s.%s" , leaseID , tokenNS . ID )
}
2015-03-24 01:11:15 +00:00
// Create a lease entry
le := leaseEntry {
2018-09-18 03:03:00 +00:00
LeaseID : leaseID ,
2015-04-10 21:48:08 +00:00
ClientToken : auth . ClientToken ,
Auth : auth ,
2018-09-18 03:03:00 +00:00
Path : te . Path ,
2016-07-07 21:44:14 +00:00
IssueTime : time . Now ( ) ,
2019-11-05 21:11:13 +00:00
ExpireTime : authExpirationTime ,
2018-09-18 03:03:00 +00:00
namespace : tokenNS ,
2019-11-07 22:10:47 +00:00
Version : 1 ,
2015-03-24 01:11:15 +00:00
}
// Encode the entry
2018-08-02 01:39:39 +00:00
if err := m . persistEntry ( ctx , & le ) ; err != nil {
2015-04-03 00:45:42 +00:00
return err
2015-03-24 01:11:15 +00:00
}
// Setup revocation timer
2020-09-23 18:46:22 +00:00
m . updatePending ( & le )
2018-06-03 22:14:51 +00:00
2015-04-09 19:39:12 +00:00
return nil
}
2016-01-04 21:43:07 +00:00
// FetchLeaseTimesByToken is a helper function to use token values to compute
// the leaseID, rather than pushing that logic back into the token store.
2018-10-15 16:56:24 +00:00
// As a special case, for a batch token it simply returns the information
// encoded on it.
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) FetchLeaseTimesByToken ( ctx context . Context , te * logical . TokenEntry ) ( * leaseEntry , error ) {
2016-01-04 21:43:07 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "fetch-lease-times-by-token" } , time . Now ( ) )
2018-10-15 16:56:24 +00:00
if te == nil {
return nil , errors . New ( "cannot fetch lease times for nil token" )
}
if te . Type == logical . TokenTypeBatch {
issueTime := time . Unix ( te . CreationTime , 0 )
return & leaseEntry {
IssueTime : issueTime ,
ExpireTime : issueTime . Add ( te . TTL ) ,
ClientTokenType : logical . TokenTypeBatch ,
} , nil
}
2018-09-18 03:03:00 +00:00
tokenNS , err := NamespaceByID ( ctx , te . NamespaceID , m . core )
if err != nil {
return nil , err
}
if tokenNS == nil {
return nil , namespace . ErrNoNamespace
}
saltCtx := namespace . ContextWithNamespace ( ctx , tokenNS )
saltedID , err := m . tokenStore . SaltID ( saltCtx , te . ID )
2017-07-18 16:02:03 +00:00
if err != nil {
return nil , err
}
2018-09-18 03:03:00 +00:00
leaseID := path . Join ( te . Path , saltedID )
if tokenNS . ID != namespace . RootNamespaceID {
leaseID = fmt . Sprintf ( "%s.%s" , leaseID , tokenNS . ID )
}
2018-08-02 01:39:39 +00:00
return m . FetchLeaseTimes ( ctx , leaseID )
2016-01-04 21:43:07 +00:00
}
// FetchLeaseTimes is used to fetch the issue time, expiration time, and last
// renewed time of a lease entry. It returns a leaseEntry itself, but with only
// those values copied over.
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) FetchLeaseTimes ( ctx context . Context , leaseID string ) ( * leaseEntry , error ) {
2016-01-04 21:43:07 +00:00
defer metrics . MeasureSince ( [ ] string { "expire" , "fetch-lease-times" } , time . Now ( ) )
2020-05-21 17:41:03 +00:00
info , ok := m . pending . Load ( leaseID )
2020-06-15 23:54:36 +00:00
if ok && info . ( pendingInfo ) . cachedLeaseInfo != nil {
return m . leaseTimesForExport ( info . ( pendingInfo ) . cachedLeaseInfo ) , nil
2018-06-11 15:58:56 +00:00
}
2021-04-29 15:12:02 +00:00
info , ok = m . zombies . Load ( leaseID )
if ok && info . ( * leaseEntry ) != nil {
return m . leaseTimesForExport ( info . ( * leaseEntry ) ) , nil
}
2016-01-04 21:43:07 +00:00
// Load the entry
2018-09-18 03:03:00 +00:00
le , err := m . loadEntryInternal ( ctx , leaseID , true , false )
2016-01-04 21:43:07 +00:00
if err != nil {
return nil , err
}
if le == nil {
return nil , nil
}
2018-06-11 15:58:56 +00:00
return m . leaseTimesForExport ( le ) , nil
}
// Returns lease times for outside callers based on the full leaseEntry passed in
func ( m * ExpirationManager ) leaseTimesForExport ( le * leaseEntry ) * leaseEntry {
2016-01-04 21:43:07 +00:00
ret := & leaseEntry {
IssueTime : le . IssueTime ,
ExpireTime : le . ExpireTime ,
LastRenewalTime : le . LastRenewalTime ,
}
2016-06-08 13:19:39 +00:00
if le . Secret != nil {
ret . Secret = & logical . Secret { }
ret . Secret . Renewable = le . Secret . Renewable
ret . Secret . TTL = le . Secret . TTL
}
if le . Auth != nil {
ret . Auth = & logical . Auth { }
ret . Auth . Renewable = le . Auth . Renewable
ret . Auth . TTL = le . Auth . TTL
}
2016-01-04 21:43:07 +00:00
2018-06-11 15:58:56 +00:00
return ret
2016-01-04 21:43:07 +00:00
}
2020-06-15 23:54:36 +00:00
// inMemoryLeaseInfo reduces a lease entry to the low-cost subset that is kept
// in pendingInfo. On top of the exported lease times, auth leases retain
// their policy list and path.
//
// Information that needs to be indexable:
//   namespace   -- derived from lease ID
//   policies    -- stored in Auth object
//   auth method -- derived from lease.Path
//
// This method reads and writes m.uniquePolicies and is called with
// pendingLock held.
func (m *ExpirationManager) inMemoryLeaseInfo(le *leaseEntry) *leaseEntry {
	info := m.leaseTimesForExport(le)
	if le.Auth == nil {
		return info
	}

	// Share one policy slice between all leases with the same policy list so
	// the list is not retained more than once. hashstructure could generate
	// the key instead, but that seems like it would be substantially slower.
	key := strings.Join(le.Auth.Policies, "\n")
	shared, seen := m.uniquePolicies[key]
	if !seen {
		shared = le.Auth.Policies
		m.uniquePolicies[key] = shared
	}
	info.Auth.Policies = shared
	info.Path = le.Path

	return info
}
// uniquePoliciesGc empties the m.uniquePolicies cache every time the
// emptyUniquePolicies timer channel fires. It blocks between firings and is
// intended to run in its own goroutine.
//
// The loop exits once m.quitContext is done. Previously it waited on the
// timer channel alone, so the goroutine could never terminate and would be
// left blocked forever after the timer stopped firing.
func (m *ExpirationManager) uniquePoliciesGc() {
	// A nil channel blocks forever in a select, so a manager constructed
	// without a quit context keeps the old run-forever behavior instead of
	// panicking on a nil interface.
	var quit <-chan struct{}
	if m.quitContext != nil {
		quit = m.quitContext.Done()
	}

	for {
		select {
		case <-quit:
			return
		case <-m.emptyUniquePolicies.C:
			// If the maximum lease is a month, and we blow away the unique
			// policy cache every week, the pessimal case is 4x larger space
			// utilization than keeping the cache indefinitely.
			m.pendingLock.Lock()
			m.uniquePolicies = make(map[string][]string)
			m.pendingLock.Unlock()
		}
	}
}
2015-04-09 19:39:12 +00:00
// updatePending is used to update a pending invocation for a lease
2020-09-23 18:46:22 +00:00
func ( m * ExpirationManager ) updatePending ( le * leaseEntry ) {
2015-04-09 19:39:12 +00:00
m . pendingLock . Lock ( )
defer m . pendingLock . Unlock ( )
2018-06-11 15:58:56 +00:00
2020-09-23 18:46:22 +00:00
m . updatePendingInternal ( le )
2018-06-08 21:24:44 +00:00
}
2015-04-09 19:39:12 +00:00
2018-06-11 15:58:56 +00:00
// updatePendingInternal is the locked version of updatePending; do not call
// this without a write lock on m.pending
2020-09-23 18:46:22 +00:00
func ( m * ExpirationManager ) updatePendingInternal ( le * leaseEntry ) {
2021-04-29 15:12:02 +00:00
if le . isZombie ( ) {
return
}
2020-05-21 17:41:03 +00:00
2015-04-09 19:39:12 +00:00
// Check for an existing timer
2020-05-21 17:41:03 +00:00
info , ok := m . pending . Load ( le . LeaseID )
2015-04-09 19:39:12 +00:00
2021-04-29 15:12:02 +00:00
var pending pendingInfo
2017-09-05 15:09:00 +00:00
if le . ExpireTime . IsZero ( ) {
2020-06-15 23:54:36 +00:00
if le . nonexpiringToken ( ) {
// Store this in the nonexpiring map instead of pending.
// There does not appear to be any cases where a token that had
// a nonzero can be can be assigned a zero TTL, but we can handle that
// anyway by falling through to the next check.
pending . cachedLeaseInfo = m . inMemoryLeaseInfo ( le )
m . nonexpiring . Store ( le . LeaseID , pending )
}
2017-09-05 15:09:00 +00:00
// if the timer happened to exist, stop the time and delete it from the
// pending timers.
if ok {
2020-05-21 17:41:03 +00:00
info . ( pendingInfo ) . timer . Stop ( )
m . pending . Delete ( le . LeaseID )
m . leaseCount --
2020-06-26 21:13:16 +00:00
if err := m . core . quotasHandleLeases ( m . quitContext , quotas . LeaseActionDeleted , [ ] string { le . LeaseID } ) ; err != nil {
2020-07-01 19:41:42 +00:00
m . logger . Error ( "failed to update quota on lease deletion" , "error" , err )
2020-06-26 21:13:16 +00:00
return
}
2017-09-05 15:09:00 +00:00
}
return
}
2020-09-23 18:46:22 +00:00
leaseTotal := le . ExpireTime . Sub ( time . Now ( ) )
2020-06-26 21:13:16 +00:00
leaseCreated := false
2018-06-11 15:58:56 +00:00
// Create entry if it does not exist or reset if it does
if ok {
2020-05-21 17:41:03 +00:00
pending = info . ( pendingInfo )
2018-06-11 15:58:56 +00:00
pending . timer . Reset ( leaseTotal )
2020-05-21 17:41:03 +00:00
// No change to lease count in this case
2018-06-11 15:58:56 +00:00
} else {
2021-01-19 22:51:41 +00:00
leaseID , namespace := le . LeaseID , le . namespace
2020-06-15 23:54:36 +00:00
// Extend the timer by the lease total
2015-04-09 19:39:12 +00:00
timer := time . AfterFunc ( leaseTotal , func ( ) {
2021-01-19 22:51:41 +00:00
m . expireFunc ( m . quitContext , m , leaseID , namespace )
2015-04-08 22:43:26 +00:00
} )
2018-06-11 15:58:56 +00:00
pending = pendingInfo {
timer : timer ,
}
2020-05-21 17:41:03 +00:00
// new lease
m . leaseCount ++
2020-06-26 21:13:16 +00:00
leaseCreated = true
2015-04-09 19:39:12 +00:00
}
2020-06-15 23:54:36 +00:00
// Retain some information in-memory
pending . cachedLeaseInfo = m . inMemoryLeaseInfo ( le )
2018-06-11 15:58:56 +00:00
2020-05-21 17:41:03 +00:00
m . pending . Store ( le . LeaseID , pending )
2020-06-26 21:13:16 +00:00
if leaseCreated {
if err := m . core . quotasHandleLeases ( m . quitContext , quotas . LeaseActionCreated , [ ] string { le . LeaseID } ) ; err != nil {
2020-07-01 19:41:42 +00:00
m . logger . Error ( "failed to update quota on lease creation" , "error" , err )
2020-06-26 21:13:16 +00:00
return
}
}
2015-03-24 01:11:15 +00:00
}
2015-03-16 01:06:19 +00:00
// revokeEntry is used to attempt revocation of an internal entry
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) revokeEntry ( ctx context . Context , le * leaseEntry ) error {
2015-03-24 01:11:15 +00:00
// Revocation of login tokens is special since we can by-pass the
// backend and directly interact with the token store
2015-04-03 18:58:10 +00:00
if le . Auth != nil {
2018-10-15 16:56:24 +00:00
if le . ClientTokenType == logical . TokenTypeBatch {
return errors . New ( "batch tokens cannot be revoked" )
}
2018-09-18 03:03:00 +00:00
if err := m . tokenStore . revokeTree ( ctx , le ) ; err != nil {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( "failed to revoke token: {{err}}" , err )
2015-03-24 01:11:15 +00:00
}
2015-09-10 01:58:09 +00:00
2015-03-24 01:11:15 +00:00
return nil
}
logical/aws: Harden WAL entry creation (#5202)
* logical/aws: Harden WAL entry creation
If AWS IAM user creation failed in any way, the WAL corresponding to the
IAM user would get left around and Vault would try to roll it back.
However, because the user never existed, the rollback failed. Thus, the
WAL would essentially get "stuck" and Vault would continually attempt to
roll it back, failing every time. A similar situation could arise if the
IAM user that Vault created got deleted out of band, or if Vault deleted
it but was unable to write the lease revocation back to storage (e.g., a
storage failure).
This attempts to harden it in two ways. One is by deleting the WAL log
entry if the IAM user creation fails. However, the WAL deletion could
still fail, and this wouldn't help where the user is deleted out of
band, so second, consider the user rolled back if the user just doesn't
exist, under certain circumstances.
Fixes #5190
* Fix segfault in expiration unit tests
TestExpiration_Tidy was passing in a leaseEntry that had a nil Secret,
which then caused a segfault as the changes to revokeEntry didn't check
whether Secret was nil; this is probably unlikely to occur in real life,
but good to be extra cautious.
* Fix potential segfault
Missed the else...
* Respond to PR feedback
2018-09-27 14:54:59 +00:00
if le . Secret != nil {
// not sure if this is really valid to have a leaseEntry with a nil Secret
// (if there's a nil Secret, what are you really leasing?), but the tests
// create one, and good to be defensive
le . Secret . IssueTime = le . IssueTime
}
2018-09-18 03:03:00 +00:00
// Make sure we're operating in the right namespace
nsCtx := namespace . ContextWithNamespace ( ctx , le . namespace )
2015-03-24 01:11:15 +00:00
// Handle standard revocation via backends
2018-09-18 03:03:00 +00:00
resp , err := m . router . Route ( nsCtx , logical . RevokeRequest ( le . Path , le . Secret , le . Data ) )
2016-05-26 03:24:10 +00:00
if err != nil || ( resp != nil && resp . IsError ( ) ) {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( fmt . Sprintf ( "failed to revoke entry: resp: %#v err: {{err}}" , resp ) , err )
2015-03-16 18:33:59 +00:00
}
return nil
2015-03-16 01:06:19 +00:00
}
2015-03-16 20:29:51 +00:00
// renewEntry is used to attempt renew of an internal entry
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) renewEntry ( ctx context . Context , le * leaseEntry , increment time . Duration ) ( * logical . Response , error ) {
2015-03-19 22:11:42 +00:00
secret := * le . Secret
2015-08-21 05:27:01 +00:00
secret . IssueTime = le . IssueTime
secret . Increment = increment
2015-04-08 20:35:32 +00:00
secret . LeaseID = ""
2018-09-18 03:03:00 +00:00
// Make sure we're operating in the right namespace
nsCtx := namespace . ContextWithNamespace ( ctx , le . namespace )
2015-04-09 21:23:37 +00:00
req := logical . RenewRequest ( le . Path , & secret , le . Data )
2018-09-18 03:03:00 +00:00
resp , err := m . router . Route ( nsCtx , req )
2016-05-26 03:24:10 +00:00
if err != nil || ( resp != nil && resp . IsError ( ) ) {
2018-04-05 15:49:21 +00:00
return nil , errwrap . Wrapf ( fmt . Sprintf ( "failed to renew entry: resp: %#v err: {{err}}" , resp ) , err )
2015-04-09 21:23:37 +00:00
}
return resp , nil
}
2016-03-09 16:07:13 +00:00
// renewAuthEntry is used to attempt renew of an auth entry. Only the token
// store should get the actual token ID intact.
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) renewAuthEntry ( ctx context . Context , req * logical . Request , le * leaseEntry , increment time . Duration ) ( * logical . Response , error ) {
2018-10-15 16:56:24 +00:00
if le . ClientTokenType == logical . TokenTypeBatch {
return logical . ErrorResponse ( "batch tokens cannot be renewed" ) , nil
}
2015-04-09 21:23:37 +00:00
auth := * le . Auth
2015-08-21 05:27:01 +00:00
auth . IssueTime = le . IssueTime
auth . Increment = increment
2016-03-09 16:07:13 +00:00
if strings . HasPrefix ( le . Path , "auth/token/" ) {
auth . ClientToken = le . ClientToken
} else {
auth . ClientToken = ""
}
2015-04-09 21:23:37 +00:00
2018-09-18 03:03:00 +00:00
// Make sure we're operating in the right namespace
nsCtx := namespace . ContextWithNamespace ( ctx , le . namespace )
2016-02-18 16:22:04 +00:00
authReq := logical . RenewAuthRequest ( le . Path , & auth , nil )
authReq . Connection = req . Connection
2018-09-18 03:03:00 +00:00
resp , err := m . router . Route ( nsCtx , authReq )
2015-03-16 20:29:51 +00:00
if err != nil {
2018-04-05 15:49:21 +00:00
return nil , errwrap . Wrapf ( "failed to renew entry: {{err}}" , err )
2015-03-16 20:29:51 +00:00
}
return resp , nil
}
2015-03-16 01:06:19 +00:00
// loadEntry is used to read a lease entry
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) loadEntry ( ctx context . Context , leaseID string ) ( * leaseEntry , error ) {
2017-09-05 15:09:00 +00:00
// Take out the lease locks after we ensure we are in restore mode
restoreMode := m . inRestoreMode ( )
if restoreMode {
m . restoreModeLock . RLock ( )
defer m . restoreModeLock . RUnlock ( )
restoreMode = m . inRestoreMode ( )
if restoreMode {
m . lockLease ( leaseID )
defer m . unlockLease ( leaseID )
}
}
2018-09-18 03:03:00 +00:00
_ , nsID := namespace . SplitIDFromString ( leaseID )
if nsID != "" {
leaseNS , err := NamespaceByID ( ctx , nsID , m . core )
if err != nil {
return nil , err
}
if leaseNS != nil {
ctx = namespace . ContextWithNamespace ( ctx , leaseNS )
}
} else {
ctx = namespace . ContextWithNamespace ( ctx , namespace . RootNamespace )
}
2018-08-02 01:39:39 +00:00
return m . loadEntryInternal ( ctx , leaseID , restoreMode , true )
2017-09-05 15:09:00 +00:00
}
// loadEntryInternal is used when you need to load an entry but also need to
// control the lifecycle of the restoreLock
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) loadEntryInternal ( ctx context . Context , leaseID string , restoreMode bool , checkRestored bool ) ( * leaseEntry , error ) {
2018-09-18 03:03:00 +00:00
ns , err := namespace . FromContext ( ctx )
if err != nil {
return nil , err
}
view := m . leaseView ( ns )
out , err := view . Get ( ctx , leaseID )
2015-03-16 01:06:19 +00:00
if err != nil {
2018-11-16 13:07:06 +00:00
return nil , errwrap . Wrapf ( fmt . Sprintf ( "failed to read lease entry %s: {{err}}" , leaseID ) , err )
2015-03-16 01:06:19 +00:00
}
if out == nil {
return nil , nil
}
le , err := decodeLeaseEntry ( out . Value )
if err != nil {
2018-11-16 13:07:06 +00:00
return nil , errwrap . Wrapf ( fmt . Sprintf ( "failed to decode lease entry %s: {{err}}" , leaseID ) , err )
2015-03-16 01:06:19 +00:00
}
2018-09-18 03:03:00 +00:00
le . namespace = ns
2017-09-05 15:09:00 +00:00
2021-04-29 15:12:02 +00:00
if le . isZombie ( ) {
m . zombies . Store ( le . LeaseID , le )
return le , nil
}
2017-09-05 15:09:00 +00:00
if restoreMode {
if checkRestored {
// If we have already loaded this lease, we don't need to update on
// load. In the case of renewal and revocation, updatePending will be
// done after making the appropriate modifications to the lease.
if _ , ok := m . restoreLoaded . Load ( leaseID ) ; ok {
return le , nil
}
}
// Update the cache of restored leases, either synchronously or through
// the lazy loaded restore process
m . restoreLoaded . Store ( le . LeaseID , struct { } { } )
// Setup revocation timer
2020-09-23 18:46:22 +00:00
m . updatePending ( le )
2017-09-05 15:09:00 +00:00
}
2015-03-16 01:06:19 +00:00
return le , nil
}
// persistEntry is used to persist a lease entry
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) persistEntry ( ctx context . Context , le * leaseEntry ) error {
2015-03-13 17:55:54 +00:00
// Encode the entry
buf , err := le . encode ( )
if err != nil {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( "failed to encode lease entry: {{err}}" , err )
2015-03-13 17:55:54 +00:00
}
// Write out to the view
2015-03-15 20:52:43 +00:00
ent := logical . StorageEntry {
2015-04-08 20:35:32 +00:00
Key : le . LeaseID ,
2015-03-13 17:55:54 +00:00
Value : buf ,
}
2017-11-06 18:10:36 +00:00
if le . Auth != nil && len ( le . Auth . Policies ) == 1 && le . Auth . Policies [ 0 ] == "root" {
ent . SealWrap = true
}
2018-09-18 03:03:00 +00:00
view := m . leaseView ( le . namespace )
if err := view . Put ( ctx , & ent ) ; err != nil {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( "failed to persist lease entry: {{err}}" , err )
2015-03-13 17:55:54 +00:00
}
2015-03-16 01:06:19 +00:00
return nil
}
2015-03-13 17:55:54 +00:00
2015-03-16 01:06:19 +00:00
// deleteEntry is used to delete a lease entry
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) deleteEntry ( ctx context . Context , le * leaseEntry ) error {
view := m . leaseView ( le . namespace )
if err := view . Delete ( ctx , le . LeaseID ) ; err != nil {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( "failed to delete lease entry: {{err}}" , err )
2015-03-16 01:06:19 +00:00
}
return nil
2015-03-13 17:55:54 +00:00
}
2015-11-04 15:48:44 +00:00
// createIndexByToken creates a secondary index from the token to a lease entry
2018-10-15 16:56:24 +00:00
func ( m * ExpirationManager ) createIndexByToken ( ctx context . Context , le * leaseEntry , token string ) error {
2018-09-18 03:03:00 +00:00
tokenNS := namespace . RootNamespace
saltCtx := namespace . ContextWithNamespace ( ctx , namespace . RootNamespace )
2018-10-15 16:56:24 +00:00
_ , nsID := namespace . SplitIDFromString ( token )
2018-09-18 03:03:00 +00:00
if nsID != "" {
2019-11-07 22:10:47 +00:00
var err error
tokenNS , err = NamespaceByID ( ctx , nsID , m . core )
2018-09-18 03:03:00 +00:00
if err != nil {
return err
}
if tokenNS != nil {
saltCtx = namespace . ContextWithNamespace ( ctx , tokenNS )
}
}
2018-10-15 16:56:24 +00:00
saltedID , err := m . tokenStore . SaltID ( saltCtx , token )
2017-07-18 16:02:03 +00:00
if err != nil {
return err
}
2018-09-18 03:03:00 +00:00
leaseSaltedID , err := m . tokenStore . SaltID ( saltCtx , le . LeaseID )
2017-07-18 16:02:03 +00:00
if err != nil {
return err
}
2015-04-10 21:48:08 +00:00
ent := logical . StorageEntry {
2017-07-18 16:02:03 +00:00
Key : saltedID + "/" + leaseSaltedID ,
2018-09-18 03:03:00 +00:00
Value : [ ] byte ( le . LeaseID ) ,
2015-04-10 21:48:08 +00:00
}
2018-09-18 03:03:00 +00:00
tokenView := m . tokenIndexView ( tokenNS )
if err := tokenView . Put ( ctx , & ent ) ; err != nil {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( "failed to persist lease index entry: {{err}}" , err )
2015-04-10 21:48:08 +00:00
}
return nil
}
2015-11-04 15:48:44 +00:00
// indexByToken looks up the secondary index from the token to a lease entry
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) indexByToken ( ctx context . Context , le * leaseEntry ) ( * logical . StorageEntry , error ) {
tokenNS := namespace . RootNamespace
saltCtx := namespace . ContextWithNamespace ( ctx , tokenNS )
_ , nsID := namespace . SplitIDFromString ( le . ClientToken )
if nsID != "" {
2019-11-07 22:10:47 +00:00
var err error
tokenNS , err = NamespaceByID ( ctx , nsID , m . core )
2018-09-18 03:03:00 +00:00
if err != nil {
return nil , err
}
if tokenNS != nil {
saltCtx = namespace . ContextWithNamespace ( ctx , tokenNS )
}
}
saltedID , err := m . tokenStore . SaltID ( saltCtx , le . ClientToken )
2017-07-18 16:02:03 +00:00
if err != nil {
return nil , err
}
2018-09-18 03:03:00 +00:00
leaseSaltedID , err := m . tokenStore . SaltID ( saltCtx , le . LeaseID )
2017-07-18 16:02:03 +00:00
if err != nil {
return nil , err
}
key := saltedID + "/" + leaseSaltedID
2018-09-18 03:03:00 +00:00
tokenView := m . tokenIndexView ( tokenNS )
entry , err := tokenView . Get ( ctx , key )
2015-11-04 15:48:44 +00:00
if err != nil {
return nil , fmt . Errorf ( "failed to look up secondary index entry" )
}
return entry , nil
}
2015-04-10 21:48:08 +00:00
// removeIndexByToken removes the secondary index from the token to a lease entry
2021-04-16 21:03:22 +00:00
func ( m * ExpirationManager ) removeIndexByToken ( ctx context . Context , le * leaseEntry , token string ) error {
2018-09-18 03:03:00 +00:00
tokenNS := namespace . RootNamespace
saltCtx := namespace . ContextWithNamespace ( ctx , namespace . RootNamespace )
2021-04-16 21:03:22 +00:00
_ , nsID := namespace . SplitIDFromString ( token )
2018-09-18 03:03:00 +00:00
if nsID != "" {
2019-11-07 22:10:47 +00:00
var err error
tokenNS , err = NamespaceByID ( ctx , nsID , m . core )
2018-09-18 03:03:00 +00:00
if err != nil {
return err
}
if tokenNS != nil {
saltCtx = namespace . ContextWithNamespace ( ctx , tokenNS )
}
2019-11-07 22:10:47 +00:00
// Downgrade logic for old-style (V0) namespace leases that had its
// secondary index live in the root namespace. This reverts to the old
// behavior of looking for the secondary index on these leases in the
// root namespace to be cleaned up properly. We set it here because the
// old behavior used the namespace's token store salt for its saltCtx.
if le . Version < 1 {
tokenNS = namespace . RootNamespace
}
2018-09-18 03:03:00 +00:00
}
2021-04-16 21:03:22 +00:00
saltedID , err := m . tokenStore . SaltID ( saltCtx , token )
2017-07-18 16:02:03 +00:00
if err != nil {
return err
}
2018-09-18 03:03:00 +00:00
leaseSaltedID , err := m . tokenStore . SaltID ( saltCtx , le . LeaseID )
2017-07-18 16:02:03 +00:00
if err != nil {
return err
}
key := saltedID + "/" + leaseSaltedID
2018-09-18 03:03:00 +00:00
tokenView := m . tokenIndexView ( tokenNS )
if err := tokenView . Delete ( ctx , key ) ; err != nil {
2018-04-05 15:49:21 +00:00
return errwrap . Wrapf ( "failed to delete lease index entry: {{err}}" , err )
2015-04-10 21:48:08 +00:00
}
return nil
}
2018-05-10 19:50:02 +00:00
// CreateOrFetchRevocationLeaseByToken is used to create or fetch the matching
// leaseID for a particular token. The lease is set to expire immediately after
// it's created.
2018-08-02 01:39:39 +00:00
func ( m * ExpirationManager ) CreateOrFetchRevocationLeaseByToken ( ctx context . Context , te * logical . TokenEntry ) ( string , error ) {
2018-05-10 19:50:02 +00:00
// Fetch the saltedID of the token and construct the leaseID
2018-09-18 03:03:00 +00:00
tokenNS , err := NamespaceByID ( ctx , te . NamespaceID , m . core )
if err != nil {
return "" , err
}
if tokenNS == nil {
return "" , namespace . ErrNoNamespace
}
saltCtx := namespace . ContextWithNamespace ( ctx , tokenNS )
saltedID , err := m . tokenStore . SaltID ( saltCtx , te . ID )
2018-05-10 19:50:02 +00:00
if err != nil {
return "" , err
}
leaseID := path . Join ( te . Path , saltedID )
2018-09-18 03:03:00 +00:00
if tokenNS . ID != namespace . RootNamespaceID {
leaseID = fmt . Sprintf ( "%s.%s" , leaseID , tokenNS . ID )
}
2018-05-10 19:50:02 +00:00
// Load the entry
2018-08-02 01:39:39 +00:00
le , err := m . loadEntry ( ctx , leaseID )
2018-05-10 19:50:02 +00:00
if err != nil {
return "" , err
}
// If there's no associated leaseEntry for the token, we create one
if le == nil {
auth := & logical . Auth {
ClientToken : te . ID ,
LeaseOptions : logical . LeaseOptions {
TTL : time . Nanosecond ,
} ,
}
if strings . Contains ( te . Path , ".." ) {
return "" , consts . ErrPathContainsParentReferences
}
// Create a lease entry
now := time . Now ( )
le = & leaseEntry {
LeaseID : leaseID ,
ClientToken : auth . ClientToken ,
Auth : auth ,
Path : te . Path ,
IssueTime : now ,
ExpireTime : now . Add ( time . Nanosecond ) ,
2018-09-18 03:03:00 +00:00
namespace : tokenNS ,
2019-11-07 22:10:47 +00:00
Version : 1 ,
2018-05-10 19:50:02 +00:00
}
// Encode the entry
2018-08-02 01:39:39 +00:00
if err := m . persistEntry ( ctx , le ) ; err != nil {
2018-05-10 19:50:02 +00:00
return "" , err
}
}
return le . LeaseID , nil
}
// lookupLeasesByToken is used to lookup all the leaseID's via the tokenID
2018-09-18 03:03:00 +00:00
func ( m * ExpirationManager ) lookupLeasesByToken ( ctx context . Context , te * logical . TokenEntry ) ( [ ] string , error ) {
tokenNS , err := NamespaceByID ( ctx , te . NamespaceID , m . core )
2017-07-18 16:02:03 +00:00
if err != nil {
return nil , err
}
2018-09-18 03:03:00 +00:00
if tokenNS == nil {
return nil , namespace . ErrNoNamespace
}
saltCtx := namespace . ContextWithNamespace ( ctx , tokenNS )
saltedID , err := m . tokenStore . SaltID ( saltCtx , te . ID )
if err != nil {
return nil , err
}
tokenView := m . tokenIndexView ( tokenNS )
2017-07-18 16:02:03 +00:00
2015-04-10 21:48:08 +00:00
// Scan via the index for sub-leases
2017-07-18 16:02:03 +00:00
prefix := saltedID + "/"
2018-09-18 03:03:00 +00:00
subKeys , err := tokenView . List ( ctx , prefix )
2015-04-10 21:48:08 +00:00
if err != nil {
2018-04-05 15:49:21 +00:00
return nil , errwrap . Wrapf ( "failed to list leases: {{err}}" , err )
2015-04-10 21:48:08 +00:00
}
// Read each index entry
leaseIDs := make ( [ ] string , 0 , len ( subKeys ) )
for _ , sub := range subKeys {
2018-09-18 03:03:00 +00:00
out , err := tokenView . Get ( ctx , prefix + sub )
2015-04-10 21:48:08 +00:00
if err != nil {
2018-04-05 15:49:21 +00:00
return nil , errwrap . Wrapf ( "failed to read lease index: {{err}}" , err )
2015-04-10 21:48:08 +00:00
}
if out == nil {
continue
}
leaseIDs = append ( leaseIDs , string ( out . Value ) )
}
2019-11-07 22:10:47 +00:00
// Downgrade logic for old-style (V0) leases entries created by a namespace
// token that lived in the root namespace.
if tokenNS . ID != namespace . RootNamespaceID {
tokenView := m . tokenIndexView ( namespace . RootNamespace )
// Scan via the index for sub-leases on the root namespace
prefix := saltedID + "/"
subKeys , err := tokenView . List ( ctx , prefix )
if err != nil {
return nil , errwrap . Wrapf ( "failed to list leases on root namespace: {{err}}" , err )
}
for _ , sub := range subKeys {
out , err := tokenView . Get ( ctx , prefix + sub )
if err != nil {
return nil , errwrap . Wrapf ( "failed to read lease index on root namespace: {{err}}" , err )
}
if out == nil {
continue
}
leaseIDs = append ( leaseIDs , string ( out . Value ) )
}
}
2015-04-10 21:48:08 +00:00
return leaseIDs , nil
}
2015-04-08 23:43:17 +00:00
// emitMetrics is invoked periodically to emit statistics
func ( m * ExpirationManager ) emitMetrics ( ) {
2020-05-21 17:41:03 +00:00
// All updates of this value are with the pendingLock held.
2017-09-05 15:09:00 +00:00
m . pendingLock . RLock ( )
2020-05-21 17:41:03 +00:00
num := m . leaseCount
2017-09-05 15:09:00 +00:00
m . pendingLock . RUnlock ( )
2020-05-21 17:41:03 +00:00
2015-04-08 23:43:17 +00:00
metrics . SetGauge ( [ ] string { "expire" , "num_leases" } , float32 ( num ) )
2018-02-14 14:22:46 +00:00
// Check if lease count is greater than the threshold
if num > maxLeaseThreshold {
2018-06-09 19:35:22 +00:00
if atomic . LoadUint32 ( m . leaseCheckCounter ) > 59 {
2020-12-14 23:00:19 +00:00
m . logger . Warn ( "lease count exceeds warning lease threshold" , "have" , num , "threshold" , maxLeaseThreshold )
2018-06-09 19:35:22 +00:00
atomic . StoreUint32 ( m . leaseCheckCounter , 0 )
2018-02-14 14:22:46 +00:00
} else {
2018-06-09 19:35:22 +00:00
atomic . AddUint32 ( m . leaseCheckCounter , 1 )
2018-02-14 14:22:46 +00:00
}
}
2015-04-08 23:43:17 +00:00
}
2020-11-13 18:26:58 +00:00
// leaseAggregationMetrics walks the leases and counts them per label returned
// by metricsutil.ExpiryBucket, then flattens the counts into gauge values.
// Each gauge carries an "expiring" label, plus a "namespace" label when
// consts.LeaseMetricsNameSpaceLabels is set.
//
// If ctx is done during or after the walk, an empty slice and a nil error are
// returned. An error from walkLeases is passed through
// suppressRestoreModeError and returned together with an empty slice.
func (m *ExpirationManager) leaseAggregationMetrics(ctx context.Context, consts metricsutil.TelemetryConstConfig) ([]metricsutil.GaugeLabelValues, error) {
	epsilon := consts.LeaseMetricsEpsilon
	withNamespace := consts.LeaseMetricsNameSpaceLabels
	windowEnd := time.Now().Add(time.Duration(consts.NumLeaseMetricsTimeBuckets) * epsilon)

	counts := make(map[metricsutil.LeaseExpiryLabel]int)
	walkErr := m.walkLeases(func(entryID string, expireTime time.Time) bool {
		// Non-blocking check: give up if collection is taking too long.
		select {
		case <-ctx.Done():
			return false
		default:
		}

		if entryID == "" {
			return true
		}

		_, nsID := namespace.SplitIDFromString(entryID)
		if nsID == "" {
			nsID = "root" // this is what metricsutil.NamespaceLabel does
		}

		if label := metricsutil.ExpiryBucket(expireTime, epsilon, windowEnd, nsID, withNamespace); label != nil {
			counts[*label]++
		}
		return true
	})
	if walkErr != nil {
		return []metricsutil.GaugeLabelValues{}, suppressRestoreModeError(walkErr)
	}

	// A cancelled collection yields an empty result rather than a partial one.
	select {
	case <-ctx.Done():
		return []metricsutil.GaugeLabelValues{}, nil
	default:
	}

	gauges := make([]metricsutil.GaugeLabelValues, 0, len(counts))
	for bucket, n := range counts {
		var labels []metrics.Label
		if withNamespace {
			labels = []metrics.Label{
				{Name: "expiring", Value: bucket.LabelName},
				{Name: "namespace", Value: bucket.LabelNS},
			}
		} else {
			labels = []metrics.Label{
				{Name: "expiring", Value: bucket.LabelName},
			}
		}
		gauges = append(gauges, metricsutil.GaugeLabelValues{
			Labels: labels,
			Value:  float32(n),
		})
	}
	return gauges, nil
}
2020-06-15 23:54:36 +00:00
// ExpirationWalkFunction is the callback type used to walk the tokens
// referenced in the expiration manager. It is handed a lease ID, the lease's
// Auth, and the lease's path; returning false asks the walk to stop.
// leaseEntry is deliberately not used here because it's an unexported type
// (though most likely this is only called from within the "vault" core
// package).
type ExpirationWalkFunction = func ( leaseID string , auth * logical . Auth , path string ) bool
2021-04-08 16:43:39 +00:00
// ErrInRestoreMode is returned by WalkTokens and walkLeases when the
// expiration manager reports that it is in restore mode.
var ErrInRestoreMode = errors . New ( "expiration manager in restore mode" )
2020-06-23 23:36:24 +00:00
2020-06-15 23:54:36 +00:00
// WalkTokens extracts the Auth structure from leases corresponding to tokens.
// Returning false from the walk function terminates the iteration.
2020-06-23 23:36:24 +00:00
func ( m * ExpirationManager ) WalkTokens ( walkFn ExpirationWalkFunction ) error {
if m . inRestoreMode ( ) {
return ErrInRestoreMode
}
2020-06-15 23:54:36 +00:00
callback := func ( key , value interface { } ) bool {
p := value . ( pendingInfo )
if p . cachedLeaseInfo == nil {
return true
}
lease := p . cachedLeaseInfo
if lease . Auth != nil {
return walkFn ( key . ( string ) , lease . Auth , lease . Path )
}
return true
}
m . pending . Range ( callback )
m . nonexpiring . Range ( callback )
2020-06-23 23:36:24 +00:00
return nil
2020-06-15 23:54:36 +00:00
}
2020-11-13 18:26:58 +00:00
// leaseWalkFunction is the callback type accepted by walkLeases. It is
// handed a lease ID and that lease's expire time, and its boolean result is
// passed back to the Range call driving the walk (false stops that Range).
// Being unexported, it can only be used by the core package.
type leaseWalkFunction = func ( leaseID string , expireTime time . Time ) bool
// walkLeases hands the lease ID and expire time of every lease in the
// pending and nonexpiring collections to walkFn. Entries without cached
// lease info are skipped.
//
// Returning false from walkFn terminates the iteration: walkFn is not called
// again, including for the collection that has not been visited yet.
//
// It returns ErrInRestoreMode when the expiration manager is in restore mode,
// and nil otherwise.
func (m *ExpirationManager) walkLeases(walkFn leaseWalkFunction) error {
	if m.inRestoreMode() {
		return ErrInRestoreMode
	}

	// halted remembers that walkFn asked to stop. A false return only ends
	// the Range call it was made from, so without this flag the second
	// Range below would call walkFn again.
	halted := false
	callback := func(key, value interface{}) bool {
		p := value.(pendingInfo)
		if p.cachedLeaseInfo == nil {
			return true
		}

		if !walkFn(key.(string), p.cachedLeaseInfo.ExpireTime) {
			halted = true
			return false
		}
		return true
	}

	m.pending.Range(callback)
	if !halted {
		m.nonexpiring.Range(callback)
	}

	return nil
}
2021-04-29 15:12:02 +00:00
// removeFromPending stops the timer of a pending lease, drops the lease from
// the pending collection, decrements the lease count, and reports the
// deletion to the quota handling. It does nothing when the lease is not
// pending.
//
// Must be called with m.pendingLock held.
func (m *ExpirationManager) removeFromPending(ctx context.Context, leaseID string) {
	info, found := m.pending.Load(leaseID)
	if !found {
		return
	}

	info.(pendingInfo).timer.Stop()
	m.pending.Delete(leaseID)
	m.leaseCount--

	// A quota update failure is logged rather than treated as fatal.
	err := m.core.quotasHandleLeases(ctx, quotas.LeaseActionDeleted, []string{leaseID})
	if err != nil {
		m.logger.Error("failed to update quota on revocation", "error", err)
	}
}
// markLeaseAsZombie records on the lease the error that made its revocation
// fail, persists the lease, stores it in the zombies collection, and removes
// it from the pending and nonexpiring collections.
//
// A nil lease is logged and ignored. A lease that is already a zombie keeps
// its original RevokeErr; the new error is only logged. The stored message is
// truncated to maxZombieErrorLength bytes, and a nil err or an empty message
// is replaced by a generic placeholder.
//
// Failure to persist the lease is logged; the in-memory bookkeeping is still
// updated.
//
// note: must be called with pending lock held
func (m *ExpirationManager) markLeaseAsZombie(ctx context.Context, le *leaseEntry, err error) {
	if le == nil {
		m.logger.Warn("attempted to mark nil lease as zombie")
		return
	}

	// Tolerate a nil err instead of panicking on err.Error().
	var errStr string
	if err != nil {
		errStr = err.Error()
	}

	if le.isZombie() {
		m.logger.Info("attempted to re-mark lease as zombie", "original_error", le.RevokeErr, "new_error", errStr)
		return
	}

	// RevokeErr must be non-empty, since isZombie keys off it.
	if len(errStr) == 0 {
		errStr = "no error message given"
	}
	if len(errStr) > maxZombieErrorLength {
		errStr = errStr[:maxZombieErrorLength]
	}

	le.RevokeErr = errStr
	if persistErr := m.persistEntry(ctx, le); persistErr != nil {
		// Best effort: keep going so the lease is still tracked as a zombie
		// in memory, but do not hide the storage failure.
		m.logger.Error("failed to persist zombie lease entry", "lease_id", le.LeaseID, "error", persistErr)
	}

	m.zombies.Store(le.LeaseID, le)
	m.removeFromPending(ctx, le.LeaseID)
	m.nonexpiring.Delete(le.LeaseID)
}
2015-03-13 17:55:54 +00:00
// leaseEntry is used to structure the values the expiration
// manager stores. This is used to handle renew and revocation.
type leaseEntry struct {
2016-01-04 21:43:07 +00:00
LeaseID string ` json:"lease_id" `
ClientToken string ` json:"client_token" `
2018-10-15 16:56:24 +00:00
ClientTokenType logical . TokenType ` json:"token_type" `
2016-01-04 21:43:07 +00:00
Path string ` json:"path" `
Data map [ string ] interface { } ` json:"data" `
Secret * logical . Secret ` json:"secret" `
Auth * logical . Auth ` json:"auth" `
IssueTime time . Time ` json:"issue_time" `
ExpireTime time . Time ` json:"expire_time" `
LastRenewalTime time . Time ` json:"last_renewal_time" `
2018-09-18 03:03:00 +00:00
2019-11-07 22:10:47 +00:00
// Version is used to track new different versions of leases. V0 (or
// zero-value) had non-root namespaced secondary indexes live in the root
// namespace, and V1 has secondary indexes live in the matching namespace.
Version int ` json:"version" `
2018-09-18 03:03:00 +00:00
namespace * namespace . Namespace
2021-04-29 15:12:02 +00:00
// RevokeErr tracks if a lease has failed revocation in a way that is
// unlikely to be automatically resolved. The first time this happens,
// RevokeErr will be set, thus marking this leaseEntry as a zombie that will
// have to be manually removed.
RevokeErr string ` json:"revokeErr" `
2015-03-13 17:55:54 +00:00
}
// encode is used to JSON encode the lease entry
2017-05-04 02:03:42 +00:00
func ( le * leaseEntry ) encode ( ) ( [ ] byte , error ) {
return json . Marshal ( le )
2015-03-13 17:55:54 +00:00
}
2017-05-04 02:03:42 +00:00
func ( le * leaseEntry ) renewable ( ) ( bool , error ) {
switch {
2018-10-15 16:56:24 +00:00
// If there is no entry, cannot review to renew
case le == nil :
return false , fmt . Errorf ( "lease not found" )
2021-04-29 15:12:02 +00:00
case le . isZombie ( ) :
return false , fmt . Errorf ( "lease is not renewable and has failed previous revocation attempts" )
2018-10-15 16:56:24 +00:00
case le . ExpireTime . IsZero ( ) :
return false , fmt . Errorf ( "lease is not renewable" )
case le . ClientTokenType == logical . TokenTypeBatch :
return false , nil
2015-04-09 21:23:37 +00:00
// Determine if the lease is expired
2017-05-04 02:03:42 +00:00
case le . ExpireTime . Before ( time . Now ( ) ) :
2018-10-15 16:56:24 +00:00
return false , fmt . Errorf ( "lease expired" )
2015-04-09 21:23:37 +00:00
// Determine if the lease is renewable
2017-05-04 02:03:42 +00:00
case le . Secret != nil && ! le . Secret . Renewable :
2018-10-15 16:56:24 +00:00
return false , fmt . Errorf ( "lease is not renewable" )
2017-05-04 02:03:42 +00:00
case le . Auth != nil && ! le . Auth . Renewable :
2018-10-15 16:56:24 +00:00
return false , fmt . Errorf ( "lease is not renewable" )
2015-04-09 21:23:37 +00:00
}
2017-05-04 02:03:42 +00:00
return true , nil
}
func ( le * leaseEntry ) ttl ( ) int64 {
return int64 ( le . ExpireTime . Sub ( time . Now ( ) . Round ( time . Second ) ) . Seconds ( ) )
2015-04-09 21:23:37 +00:00
}
2020-06-15 23:54:36 +00:00
// nonexpiringToken reports whether the lease has an Auth whose LeaseEnabled
// method returns false. A lease without an Auth is never a nonexpiring token.
func (le *leaseEntry) nonexpiringToken() bool {
	return le.Auth != nil && !le.Auth.LeaseEnabled()
}
2021-04-29 15:12:02 +00:00
// isZombie reports whether the lease has a recorded revocation error, i.e.
// whether RevokeErr is non-empty.
//
// TODO maybe lock RevokeErr once this goes in: https://github.com/hashicorp/vault/pull/11122
func (le *leaseEntry) isZombie() bool {
	return len(le.RevokeErr) > 0
}
2015-03-13 17:55:54 +00:00
// decodeLeaseEntry is used to reverse encode and return a new entry
func decodeLeaseEntry ( buf [ ] byte ) ( * leaseEntry , error ) {
out := new ( leaseEntry )
2016-07-06 16:25:40 +00:00
return out , jsonutil . DecodeJSON ( buf , out )
2015-03-13 01:38:15 +00:00
}