backport of commit dbfaa6f81a156ec1dcb85d8d76941d4ac70c91fb (#23285)

Co-authored-by: Steven Clark <steven.clark@hashicorp.com>
hc-github-team-secure-vault-core 2023-09-26 14:18:06 -04:00 committed by GitHub
parent ab1f3c8b83
commit c08d731634
4 changed files with 51 additions and 14 deletions


@@ -10,6 +10,7 @@ import (
"sync"
"time"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/logical"
)
@@ -42,6 +43,7 @@ type ChallengeValidation struct {
type ChallengeQueueEntry struct {
Identifier string
RetryAfter time.Time
NumRetries int // Track if we are spinning on a corrupted challenge
}
type ACMEChallengeEngine struct {
@@ -97,7 +99,7 @@ func (ace *ACMEChallengeEngine) Run(b *backend, state *acmeState, sc *storageCon
b.Logger().Error("failed loading existing ACME challenge validations:", "err", err)
}
for true {
for {
// err == nil on shutdown.
b.Logger().Debug("Starting ACME challenge validation engine")
err := ace._run(b, state)
@@ -119,7 +121,7 @@ func (ace *ACMEChallengeEngine) _run(b *backend, state *acmeState) error {
// We want at most a certain number of workers operating to verify
// challenges.
var finishedWorkersChannels []chan bool
for true {
for {
// Wait until we've got more work to do.
select {
case <-ace.Closing:
@@ -201,12 +203,17 @@ func (ace *ACMEChallengeEngine) _run(b *backend, state *acmeState) error {
// looping through the queue until we hit a repeat.
firstIdentifier = ""
// If we are no longer the active node, break out
if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary | consts.ReplicationPerformanceStandby) {
break
}
// Here, we got a piece of work that is ready to check; create a
// channel and a new go routine and run it. Note that this still
// could have a RetryAfter date we're not aware of (e.g., if the
// cluster restarted as we do not read the entries there).
channel := make(chan bool, 1)
go ace.VerifyChallenge(runnerSC, task.Identifier, channel, config)
go ace.VerifyChallenge(runnerSC, task.Identifier, task.NumRetries, channel, config)
finishedWorkersChannels = append(finishedWorkersChannels, channel)
startedWork = true
}
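
For context, the dispatch step above creates one goroutine per ready challenge and keeps each worker's completion channel so the loop can drain them later. A minimal, self-contained sketch of that pattern follows; the `task` and `verify` names are illustrative stand-ins, not the engine's actual types:

```go
package main

import (
	"fmt"
	"time"
)

// task is a stand-in for the engine's ChallengeQueueEntry.
type task struct {
	Identifier string
	NumRetries int
}

// verify simulates checking one ACME challenge and reports completion on its
// dedicated channel.
func verify(t task, finished chan bool) {
	time.Sleep(10 * time.Millisecond)
	fmt.Println("verified", t.Identifier, "after", t.NumRetries, "prior retries")
	finished <- true
}

func main() {
	tasks := []task{{Identifier: "example.com"}, {Identifier: "example.org", NumRetries: 1}}

	var finishedWorkersChannels []chan bool
	for _, t := range tasks {
		// Buffered so a worker can finish even before anyone reads the result.
		channel := make(chan bool, 1)
		go verify(t, channel)
		finishedWorkersChannels = append(finishedWorkersChannels, channel)
	}

	// Drain every worker's channel before exiting.
	for _, ch := range finishedWorkersChannels {
		<-ch
	}
}
```
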
@@ -305,8 +312,9 @@ func (ace *ACMEChallengeEngine) AcceptChallenge(sc *storageContext, account stri
return nil
}
func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id string, finished chan bool, config *acmeConfigEntry) {
sc, _ /* cancel func */ := runnerSc.WithFreshTimeout(MaxChallengeTimeout)
func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id string, validationQueueRetries int, finished chan bool, config *acmeConfigEntry) {
sc, cancel := runnerSc.WithFreshTimeout(MaxChallengeTimeout)
defer cancel()
runnerSc.Backend.Logger().Debug("Starting verification of challenge", "id", id)
if retry, retryAfter, err := ace._verifyChallenge(sc, id, config); err != nil {
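
Besides threading the retry count through, the new signature stops discarding the cancel function returned by `WithFreshTimeout` and defers it instead. A rough standard-library equivalent of that fix, with `maxChallengeTimeout` and `verifyChallenge` as hypothetical stand-ins for the engine's helpers:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// maxChallengeTimeout is a stand-in for the engine's MaxChallengeTimeout.
const maxChallengeTimeout = time.Minute

func verifyChallenge(parent context.Context, id string) {
	// Derive a per-verification timeout context and release it when done.
	// Discarding cancel (as the old code did) would keep the context's
	// resources alive until the timeout fires.
	ctx, cancel := context.WithTimeout(parent, maxChallengeTimeout)
	defer cancel()

	deadline, _ := ctx.Deadline()
	fmt.Println("verifying", id, "until", deadline.Format(time.RFC3339))
}

func main() {
	verifyChallenge(context.Background(), "example.com")
}
```
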
@@ -316,11 +324,28 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
sc.Backend.Logger().Error(fmt.Sprintf("ACME validation failed for %v: %v", id, err))
if retry {
validationQueueRetries++
// The retry logic within _verifyChallenge depends on being able to read and decode
// the ACME challenge entries. If we encounter such failures we would retry forever, so
// we have a secondary check here to detect whether we have looped through the validation
// queue more times than the normal retry attempts would allow.
if validationQueueRetries > MaxRetryAttempts*2 {
sc.Backend.Logger().Warn("reached max error attempts within challenge queue: %v, giving up", id)
_, _, err = ace._verifyChallengeCleanup(sc, nil, id)
if err != nil {
sc.Backend.Logger().Warn("Failed cleaning up challenge entry: %v", err)
}
finished <- true
return
}
ace.ValidationLock.Lock()
defer ace.ValidationLock.Unlock()
ace.Validations.PushBack(&ChallengeQueueEntry{
Identifier: id,
RetryAfter: retryAfter,
NumRetries: validationQueueRetries,
})
// Let the validator know there's a pending challenge.
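
The block above is the core of the fix for corrupted entries: each requeue increments `NumRetries`, and once the count exceeds twice `MaxRetryAttempts` the challenge is cleaned up rather than requeued. A minimal sketch of that cap, using hypothetical `requeueOrGiveUp`, `requeue`, and `cleanup` helpers in place of the engine's queue and storage code:

```go
package main

import (
	"fmt"
	"time"
)

// maxRetryAttempts is a stand-in for the engine's MaxRetryAttempts.
const maxRetryAttempts = 5

type challengeQueueEntry struct {
	Identifier string
	RetryAfter time.Time
	NumRetries int
}

// requeueOrGiveUp re-queues a failed challenge unless it has already been
// retried more than twice the normal budget, in which case it is cleaned up
// instead of spinning forever.
func requeueOrGiveUp(id string, retries int, retryAfter time.Time,
	requeue func(challengeQueueEntry), cleanup func(string)) {
	retries++
	if retries > maxRetryAttempts*2 {
		// Likely a corrupted or undecodable entry; stop retrying it.
		cleanup(id)
		return
	}
	requeue(challengeQueueEntry{Identifier: id, RetryAfter: retryAfter, NumRetries: retries})
}

func main() {
	requeueOrGiveUp("example.com", 11, time.Now().Add(time.Second),
		func(e challengeQueueEntry) { fmt.Println("requeued", e.Identifier, "retries:", e.NumRetries) },
		func(id string) { fmt.Println("giving up on", id) })
}
```
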
@@ -343,23 +368,23 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string, config *acmeConfigEntry) (bool, time.Time, error) {
now := time.Now()
backoffTime := now.Add(1 * time.Second)
path := acmeValidationPrefix + id
challengeEntry, err := sc.Storage.Get(sc.Context, path)
if err != nil {
return true, now, fmt.Errorf("error loading challenge %v: %w", id, err)
return true, backoffTime, fmt.Errorf("error loading challenge %v: %w", id, err)
}
if challengeEntry == nil {
// Something must've successfully cleaned up our storage entry from
// under us. Assume we don't need to rerun, else the client will
// trigger us to re-run.
err = nil
return ace._verifyChallengeCleanup(sc, err, id)
return ace._verifyChallengeCleanup(sc, nil, id)
}
var cv *ChallengeValidation
if err := challengeEntry.DecodeJSON(&cv); err != nil {
return true, now, fmt.Errorf("error decoding challenge %v: %w", id, err)
return true, backoffTime, fmt.Errorf("error decoding challenge %v: %w", id, err)
}
if now.Before(cv.RetryAfter) {
@@ -369,7 +394,7 @@ func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string,
authzPath := getAuthorizationPath(cv.Account, cv.Authorization)
authz, err := loadAuthorizationAtPath(sc, authzPath)
if err != nil {
return true, now, fmt.Errorf("error loading authorization %v/%v for challenge %v: %w", cv.Account, cv.Authorization, id, err)
return true, backoffTime, fmt.Errorf("error loading authorization %v/%v for challenge %v: %w", cv.Account, cv.Authorization, id, err)
}
if authz.Status != ACMEAuthorizationPending {
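
The other change in `_verifyChallenge` is that load and decode failures now return `backoffTime` (one second in the future) rather than `now`, so a broken entry is not immediately re-checked in a tight loop. A small sketch of that behavior, with `loadChallenge` as a hypothetical stand-in for the storage read and JSON decode:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// loadChallenge is a hypothetical helper standing in for the storage read and
// JSON decode performed by _verifyChallenge.
func loadChallenge(id string) (string, error) {
	return "", errors.New("storage unavailable")
}

// verifyOnce mirrors the (retry, retryAfter, err) return shape: failures now
// come back with a retry time one second in the future instead of "now", so a
// broken entry cannot be re-checked in a tight loop.
func verifyOnce(id string) (bool, time.Time, error) {
	now := time.Now()
	backoffTime := now.Add(1 * time.Second)

	if _, err := loadChallenge(id); err != nil {
		return true, backoffTime, fmt.Errorf("error loading challenge %v: %w", id, err)
	}
	return false, now, nil
}

func main() {
	retry, after, err := verifyOnce("example.com")
	fmt.Println(retry, after.Format(time.RFC3339Nano), err)
}
```
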
@@ -527,7 +552,7 @@ func (ace *ACMEChallengeEngine) _verifyChallengeCleanup(sc *storageContext, err
// Remove our ChallengeValidation entry only.
if deleteErr := sc.Storage.Delete(sc.Context, acmeValidationPrefix+id); deleteErr != nil {
return true, now.Add(-1 * time.Second), fmt.Errorf("error deleting challenge %v (error prior to cleanup, if any: %v): %w", id, err, deleteErr)
return true, now.Add(1 * time.Second), fmt.Errorf("error deleting challenge %v (error prior to cleanup, if any: %v): %w", id, err, deleteErr)
}
if err != nil {


@@ -18,6 +18,7 @@ import (
"github.com/hashicorp/go-secure-stdlib/nonceutil"
"github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/logical"
)
@@ -76,6 +77,13 @@ func (a *acmeState) Initialize(b *backend, sc *storageContext) error {
return fmt.Errorf("error initializing ACME engine: %w", err)
}
if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary | consts.ReplicationPerformanceStandby) {
// It is assumed that if the node does become the active node later,
// the plugin is re-initialized, so this is safe. It also spares the node
// from loading the existing queue into memory for no reason.
b.Logger().Debug("Not on an active node, skipping starting ACME challenge validation engine")
return nil
}
// Kick off our ACME challenge validation engine.
go a.validator.Run(b, a, sc)
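
`Initialize` now skips starting the validator entirely on DR secondaries and performance standbys. A self-contained sketch of that guard, using stand-in bit flags in place of the Vault SDK's `consts` values:

```go
package main

import "fmt"

// replicationState and its flags are stand-ins for the Vault SDK's
// consts.ReplicationDRSecondary and consts.ReplicationPerformanceStandby.
type replicationState uint32

const (
	replicationDRSecondary replicationState = 1 << iota
	replicationPerformanceStandby
)

// HasState mirrors the "any of these bits set" check used above.
func (s replicationState) HasState(flags replicationState) bool {
	return s&flags != 0
}

// maybeStartValidator only starts the challenge validation engine when the
// node is neither a DR secondary nor a performance standby.
func maybeStartValidator(state replicationState, start func()) {
	if state.HasState(replicationDRSecondary | replicationPerformanceStandby) {
		fmt.Println("not on an active node, skipping ACME challenge validation engine")
		return
	}
	start()
}

func main() {
	maybeStartValidator(replicationPerformanceStandby, func() { fmt.Println("validator started") })
	maybeStartValidator(0, func() { fmt.Println("validator started") })
}
```
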


@@ -64,9 +64,10 @@ func (ts *TestServer) setupRunner(domain string, network string) {
ContainerName: "bind9-dns-" + strings.ReplaceAll(domain, ".", "-"),
NetworkName: network,
Ports: []string{"53/udp"},
LogConsumer: func(s string) {
ts.log.Info(s)
},
// DNS container logging was disabled to reduce content within CI logs.
//LogConsumer: func(s string) {
// ts.log.Info(s)
//},
})
require.NoError(ts.t, err)
}

changelog/23278.txt

@@ -0,0 +1,3 @@
```release-note:bug
secrets/pki: Stop processing in-flight ACME verifications when an active node steps down
```