2023-04-19 16:31:19 +00:00
package pki
import (
"container/list"
"context"
"fmt"
"sync"
"time"
"github.com/hashicorp/vault/sdk/logical"
)
// MaxChallengeTimeout is the timeout VerifyChallenge hands to
// WithFreshTimeout when it builds the storage context for one verification
// attempt. It is a variable rather than a constant, so it can be reassigned.
var MaxChallengeTimeout = 1 * time . Minute
// MaxRetryAttempts is the RetryCount threshold: once a validation's
// RetryCount is greater than this value, the challenge and its authorization
// are marked invalid and the queued validation entry is removed.
const MaxRetryAttempts = 5
2023-06-08 13:25:07 +00:00
// ChallengeAttemptFailedMsg is a troubleshooting hint appended to challenge
// validation error messages built in _verifyChallenge.
const ChallengeAttemptFailedMsg = "this may occur if the validation target was misconfigured: check that challenge responses are available at the required locations and retry."
2023-04-19 16:31:19 +00:00
// ChallengeValidation is the JSON record persisted under
// acmeValidationPrefix for a challenge that has been accepted and is waiting
// to be (re)verified by the challenge engine.
type ChallengeValidation struct {
// Account KID that this validation attempt is recorded under.
Account string ` json:"account" `
// The authorization ID that this validation attempt is for.
Authorization string ` json:"authorization" `
// ChallengeType is copied from the accepted challenge and is used to find
// the matching challenge inside the authorization at verification time.
ChallengeType ACMEChallengeType ` json:"challenge_type" `
// The token of this challenge and the JWS thumbprint of the account
// we're validating against.
Token string ` json:"token" `
Thumbprint string ` json:"thumbprint" `
// Initiated is the time AcceptChallenge created this record.
Initiated time . Time ` json:"initiated" `
// FirstValidation is set the first time a retry is recorded for this
// validation (when it is still the zero time).
FirstValidation time . Time ` json:"first_validation,omitempty" `
// RetryCount is incremented each time a retry is scheduled.
RetryCount int ` json:"retry_count,omitempty" `
// LastRetry is the time the most recent retry was scheduled.
LastRetry time . Time ` json:"last_retry,omitempty" `
// RetryAfter is the earliest time the next attempt may run; it is set to
// the scheduling time plus RetryCount*5 seconds.
RetryAfter time . Time ` json:"retry_after,omitempty" `
}
2023-04-21 17:08:27 +00:00
// ChallengeQueueEntry is one item in the engine's in-memory validation queue.
type ChallengeQueueEntry struct {
// Identifier is the storage key of the validation, relative to
// acmeValidationPrefix.
Identifier string
// RetryAfter is the earliest time the runner will start this entry; the
// zero value means it may be started immediately.
RetryAfter time . Time
}
2023-04-19 16:31:19 +00:00
// ACMEChallengeEngine holds the in-memory queue of pending challenge
// validations and the channels used to wake up and shut down the runner.
type ACMEChallengeEngine struct {
// NumWorkers caps how many VerifyChallenge goroutines the runner keeps
// in flight at once.
NumWorkers int
// ValidationLock guards access to Validations.
ValidationLock sync . Mutex
// NewValidation wakes the runner when there is queued work. The runner
// discards the received string; only the signal matters.
NewValidation chan string
// Closing makes the runner return (with a nil error) when a value is
// received from it.
Closing chan struct { }
// Validations is the queue of *ChallengeQueueEntry values.
Validations * list . List
}
// NewACMEChallengeEngine returns an engine with an empty validation queue,
// single-slot NewValidation and Closing channels, and a limit of five
// concurrent workers.
func NewACMEChallengeEngine() *ACMEChallengeEngine {
	return &ACMEChallengeEngine{
		NumWorkers:    5,
		NewValidation: make(chan string, 1),
		Closing:       make(chan struct{}, 1),
		Validations:   list.New(),
	}
}
func ( ace * ACMEChallengeEngine ) LoadFromStorage ( b * backend , sc * storageContext ) error {
items , err := sc . Storage . List ( sc . Context , acmeValidationPrefix )
if err != nil {
return fmt . Errorf ( "failed loading list of validations from disk: %w" , err )
}
ace . ValidationLock . Lock ( )
defer ace . ValidationLock . Unlock ( )
// Add them to our queue of validations to work through later.
2023-06-12 18:42:54 +00:00
foundExistingValidations := false
2023-04-19 16:31:19 +00:00
for _ , item := range items {
2023-04-21 17:08:27 +00:00
ace . Validations . PushBack ( & ChallengeQueueEntry {
Identifier : item ,
} )
2023-06-12 18:42:54 +00:00
foundExistingValidations = true
}
if foundExistingValidations {
ace . NewValidation <- "existing"
2023-04-19 16:31:19 +00:00
}
return nil
}
2023-06-12 18:42:54 +00:00
func ( ace * ACMEChallengeEngine ) Run ( b * backend , state * acmeState , sc * storageContext ) {
// We load the existing ACME challenges within the Run thread to avoid
// delaying the PKI mount initialization
b . Logger ( ) . Debug ( "Loading existing challenge validations on disk" )
err := ace . LoadFromStorage ( b , sc )
if err != nil {
b . Logger ( ) . Error ( "failed loading existing ACME challenge validations:" , "err" , err )
}
2023-04-19 16:31:19 +00:00
for true {
// err == nil on shutdown.
b . Logger ( ) . Debug ( "Starting ACME challenge validation engine" )
2023-04-27 19:30:29 +00:00
err := ace . _run ( b , state )
2023-04-19 16:31:19 +00:00
if err != nil {
b . Logger ( ) . Error ( "Got unexpected error from ACME challenge validation engine" , "err" , err )
time . Sleep ( 1 * time . Second )
continue
}
break
}
}
2023-04-27 19:30:29 +00:00
func ( ace * ACMEChallengeEngine ) _run ( b * backend , state * acmeState ) error {
2023-04-19 16:31:19 +00:00
// This runner uses a background context for storage operations: we don't
// want to tie it to a inbound request and we don't want to set a time
// limit, so create a fresh background context.
runnerSC := b . makeStorageContext ( context . Background ( ) , b . storage )
// We want at most a certain number of workers operating to verify
// challenges.
var finishedWorkersChannels [ ] chan bool
for true {
// Wait until we've got more work to do.
select {
case <- ace . Closing :
b . Logger ( ) . Debug ( "shutting down ACME challenge validation engine" )
return nil
case <- ace . NewValidation :
}
// First try to reap any finished workers. Read from their channels
// and if not finished yet, add to a fresh slice.
var newFinishedWorkersChannels [ ] chan bool
for _ , channel := range finishedWorkersChannels {
select {
case <- channel :
default :
// This channel had not been written to, indicating that the
// worker had not yet finished.
newFinishedWorkersChannels = append ( newFinishedWorkersChannels , channel )
}
}
finishedWorkersChannels = newFinishedWorkersChannels
// If we have space to take on another work item, do so.
2023-04-21 17:08:27 +00:00
firstIdentifier := ""
startedWork := false
now := time . Now ( )
for len ( finishedWorkersChannels ) < ace . NumWorkers {
var task * ChallengeQueueEntry
// Find our next work item. We do all of these operations
// while holding the queue lock, hence some repeated checks
// afterwards. Out of this, we get a candidate task, using
// element == nil as a sentinel for breaking our parent
// loop.
2023-04-19 16:31:19 +00:00
ace . ValidationLock . Lock ( )
element := ace . Validations . Front ( )
if element != nil {
ace . Validations . Remove ( element )
2023-04-21 17:08:27 +00:00
task = element . Value . ( * ChallengeQueueEntry )
if ! task . RetryAfter . IsZero ( ) && now . Before ( task . RetryAfter ) {
// We cannot work on this element yet; remove it to
// the back of the queue. This allows us to potentially
// select the next item in the next iteration.
ace . Validations . PushBack ( task )
}
if firstIdentifier != "" && task . Identifier == firstIdentifier {
// We found and rejected this element before; exit the
// loop by "claiming" we didn't find any work.
element = nil
} else if firstIdentifier == "" {
firstIdentifier = task . Identifier
}
2023-04-19 16:31:19 +00:00
}
ace . ValidationLock . Unlock ( )
2023-04-21 17:08:27 +00:00
if element == nil {
// There was no more work to do to fill up the queue; exit
// this loop.
break
}
if now . Before ( task . RetryAfter ) {
// Here, while we found an element, we didn't want to
// completely exit the loop (perhaps it was our first time
// finding a work order), so retry without modifying
// firstIdentifier.
continue
}
2023-04-19 16:31:19 +00:00
2023-04-27 19:30:29 +00:00
config , err := state . getConfigWithUpdate ( runnerSC )
if err != nil {
return fmt . Errorf ( "failed fetching ACME configuration: %w" , err )
}
2023-04-21 17:08:27 +00:00
// Since this work item was valid, we won't expect to see it in
// the validation queue again until it is executed. Here, we
// want to avoid infinite looping above (if we removed the one
// valid item and the remainder are all not immediately
// actionable). At the worst, we'll spend a little more time
// looping through the queue until we hit a repeat.
firstIdentifier = ""
// Here, we got a piece of work that is ready to check; create a
// channel and a new go routine and run it. Note that this still
// could have a RetryAfter date we're not aware of (e.g., if the
// cluster restarted as we do not read the entries there).
2023-04-19 16:31:19 +00:00
channel := make ( chan bool , 1 )
2023-04-27 19:30:29 +00:00
go ace . VerifyChallenge ( runnerSC , task . Identifier , channel , config )
2023-04-21 17:08:27 +00:00
finishedWorkersChannels = append ( finishedWorkersChannels , channel )
startedWork = true
2023-04-19 16:31:19 +00:00
}
2023-04-21 17:08:27 +00:00
// If we have no more capacity for work, we should pause a little to
// let the system catch up. Additionally, if we only had
// non-actionable work items, we should pause until some time has
// elapsed: not too much that we potentially starve any new incoming
// items from validation, but not too short that we cause a busy loop.
if len ( finishedWorkersChannels ) == ace . NumWorkers || ! startedWork {
time . Sleep ( 100 * time . Millisecond )
2023-04-19 16:31:19 +00:00
}
// Lastly, if we have more work to do, re-trigger ourselves.
ace . ValidationLock . Lock ( )
if ace . Validations . Front ( ) != nil {
select {
case ace . NewValidation <- "retry" :
default :
}
}
ace . ValidationLock . Unlock ( )
}
return fmt . Errorf ( "unexpectedly exited from ACMEChallengeEngine._run()" )
}
func ( ace * ACMEChallengeEngine ) AcceptChallenge ( sc * storageContext , account string , authz * ACMEAuthorization , challenge * ACMEChallenge , thumbprint string ) error {
name := authz . Id + "-" + string ( challenge . Type )
path := acmeValidationPrefix + name
entry , err := sc . Storage . Get ( sc . Context , path )
if err == nil && entry != nil {
// Challenge already in the queue; exit without re-adding it.
return nil
}
if authz . Status != ACMEAuthorizationPending {
2023-06-01 19:57:41 +00:00
return fmt . Errorf ( "%w: cannot accept already validated authorization %v (%v)" , ErrMalformed , authz . Id , authz . Status )
2023-04-19 16:31:19 +00:00
}
2023-06-01 19:57:41 +00:00
for _ , otherChallenge := range authz . Challenges {
// We assume within an authorization we won't have multiple challenges of the same challenge type
// and we want to limit a single challenge being in a processing state to avoid race conditions
// failing one challenge and passing another.
if otherChallenge . Type != challenge . Type && otherChallenge . Status != ACMEChallengePending {
return fmt . Errorf ( "%w: only a single challenge within an authorization can be accepted (%v) in status %v" , ErrMalformed , otherChallenge . Type , otherChallenge . Status )
}
// The requested challenge can ping us to wake us up, so allow pending and currently processing statuses
if otherChallenge . Status != ACMEChallengePending && otherChallenge . Status != ACMEChallengeProcessing {
return fmt . Errorf ( "%w: challenge is in invalid state (%v) in authorization %v" , ErrMalformed , challenge . Status , authz . Id )
}
2023-04-19 16:31:19 +00:00
}
token := challenge . ChallengeFields [ "token" ] . ( string )
cv := & ChallengeValidation {
Account : account ,
Authorization : authz . Id ,
ChallengeType : challenge . Type ,
Token : token ,
Thumbprint : thumbprint ,
Initiated : time . Now ( ) ,
}
json , err := logical . StorageEntryJSON ( path , & cv )
if err != nil {
return fmt . Errorf ( "error creating challenge validation queue entry: %w" , err )
}
if err := sc . Storage . Put ( sc . Context , json ) ; err != nil {
return fmt . Errorf ( "error writing challenge validation entry: %w" , err )
}
if challenge . Status == ACMEChallengePending {
challenge . Status = ACMEChallengeProcessing
authzPath := getAuthorizationPath ( account , authz . Id )
if err := saveAuthorizationAtPath ( sc , authzPath , authz ) ; err != nil {
return fmt . Errorf ( "error saving updated authorization %v: %w" , authz . Id , err )
}
}
ace . ValidationLock . Lock ( )
defer ace . ValidationLock . Unlock ( )
2023-04-21 17:08:27 +00:00
ace . Validations . PushBack ( & ChallengeQueueEntry {
Identifier : name ,
} )
2023-04-19 16:31:19 +00:00
select {
case ace . NewValidation <- name :
default :
}
return nil
}
2023-04-27 19:30:29 +00:00
func ( ace * ACMEChallengeEngine ) VerifyChallenge ( runnerSc * storageContext , id string , finished chan bool , config * acmeConfigEntry ) {
2023-04-19 16:31:19 +00:00
sc , _ /* cancel func */ := runnerSc . WithFreshTimeout ( MaxChallengeTimeout )
2023-04-27 20:05:23 +00:00
runnerSc . Backend . Logger ( ) . Debug ( "Starting verification of challenge" , "id" , id )
2023-04-19 16:31:19 +00:00
2023-04-27 19:30:29 +00:00
if retry , retryAfter , err := ace . _verifyChallenge ( sc , id , config ) ; err != nil {
2023-04-19 16:31:19 +00:00
// Because verification of this challenge failed, we need to retry
// it in the future. Log the error and re-add the item to the queue
// to try again later.
sc . Backend . Logger ( ) . Error ( fmt . Sprintf ( "ACME validation failed for %v: %v" , id , err ) )
if retry {
ace . ValidationLock . Lock ( )
defer ace . ValidationLock . Unlock ( )
2023-04-21 17:08:27 +00:00
ace . Validations . PushBack ( & ChallengeQueueEntry {
Identifier : id ,
RetryAfter : retryAfter ,
} )
2023-04-19 16:31:19 +00:00
// Let the validator know there's a pending challenge.
select {
case ace . NewValidation <- id :
default :
}
}
// We're the only producer on this channel and it has a buffer size
// of one element, so it is safe to directly write here.
finished <- true
return
}
// We're the only producer on this channel and it has a buffer size of one
// element, so it is safe to directly write here.
finished <- false
}
2023-04-27 19:30:29 +00:00
func ( ace * ACMEChallengeEngine ) _verifyChallenge ( sc * storageContext , id string , config * acmeConfigEntry ) ( bool , time . Time , error ) {
2023-04-19 16:31:19 +00:00
now := time . Now ( )
path := acmeValidationPrefix + id
challengeEntry , err := sc . Storage . Get ( sc . Context , path )
if err != nil {
2023-04-21 17:08:27 +00:00
return true , now , fmt . Errorf ( "error loading challenge %v: %w" , id , err )
2023-04-19 16:31:19 +00:00
}
if challengeEntry == nil {
// Something must've successfully cleaned up our storage entry from
// under us. Assume we don't need to rerun, else the client will
// trigger us to re-run.
err = nil
return ace . _verifyChallengeCleanup ( sc , err , id )
}
var cv * ChallengeValidation
if err := challengeEntry . DecodeJSON ( & cv ) ; err != nil {
2023-04-21 17:08:27 +00:00
return true , now , fmt . Errorf ( "error decoding challenge %v: %w" , id , err )
2023-04-19 16:31:19 +00:00
}
if now . Before ( cv . RetryAfter ) {
2023-04-21 17:08:27 +00:00
return true , cv . RetryAfter , fmt . Errorf ( "retrying challenge %v too soon" , id )
2023-04-19 16:31:19 +00:00
}
authzPath := getAuthorizationPath ( cv . Account , cv . Authorization )
authz , err := loadAuthorizationAtPath ( sc , authzPath )
if err != nil {
2023-04-21 17:08:27 +00:00
return true , now , fmt . Errorf ( "error loading authorization %v/%v for challenge %v: %w" , cv . Account , cv . Authorization , id , err )
2023-04-19 16:31:19 +00:00
}
if authz . Status != ACMEAuthorizationPending {
// Something must've finished up this challenge for us. Assume we
// don't need to rerun and exit instead.
err = nil
return ace . _verifyChallengeCleanup ( sc , err , id )
}
var challenge * ACMEChallenge
for _ , authzChallenge := range authz . Challenges {
if authzChallenge . Type == cv . ChallengeType {
challenge = authzChallenge
break
}
}
if challenge == nil {
err = fmt . Errorf ( "no challenge of type %v in authorization %v/%v for challenge %v" , cv . ChallengeType , cv . Account , cv . Authorization , id )
return ace . _verifyChallengeCleanup ( sc , err , id )
}
if challenge . Status != ACMEChallengePending && challenge . Status != ACMEChallengeProcessing {
err = fmt . Errorf ( "challenge is in invalid state %v in authorization %v/%v for challenge %v" , challenge . Status , cv . Account , cv . Authorization , id )
return ace . _verifyChallengeCleanup ( sc , err , id )
}
var valid bool
switch challenge . Type {
case ACMEHTTPChallenge :
if authz . Identifier . Type != ACMEDNSIdentifier && authz . Identifier . Type != ACMEIPIdentifier {
err = fmt . Errorf ( "unsupported identifier type for authorization %v/%v in challenge %v: %v" , cv . Account , cv . Authorization , id , authz . Identifier . Type )
return ace . _verifyChallengeCleanup ( sc , err , id )
}
if authz . Wildcard {
err = fmt . Errorf ( "unable to validate wildcard authorization %v/%v in challenge %v via http-01 challenge" , cv . Account , cv . Authorization , id )
return ace . _verifyChallengeCleanup ( sc , err , id )
}
2023-04-27 19:30:29 +00:00
valid , err = ValidateHTTP01Challenge ( authz . Identifier . Value , cv . Token , cv . Thumbprint , config )
2023-04-19 16:31:19 +00:00
if err != nil {
2023-06-08 13:25:07 +00:00
err = fmt . Errorf ( "%w: error validating http-01 challenge %v: %v; %v" , ErrIncorrectResponse , id , err , ChallengeAttemptFailedMsg )
2023-06-01 19:57:41 +00:00
return ace . _verifyChallengeRetry ( sc , cv , authzPath , authz , challenge , err , id )
2023-04-19 16:31:19 +00:00
}
2023-04-21 17:08:27 +00:00
case ACMEDNSChallenge :
if authz . Identifier . Type != ACMEDNSIdentifier {
err = fmt . Errorf ( "unsupported identifier type for authorization %v/%v in challenge %v: %v" , cv . Account , cv . Authorization , id , authz . Identifier . Type )
return ace . _verifyChallengeCleanup ( sc , err , id )
}
2023-04-27 19:30:29 +00:00
valid , err = ValidateDNS01Challenge ( authz . Identifier . Value , cv . Token , cv . Thumbprint , config )
2023-04-21 17:08:27 +00:00
if err != nil {
2023-06-08 13:25:07 +00:00
err = fmt . Errorf ( "%w: error validating dns-01 challenge %v: %v; %v" , ErrIncorrectResponse , id , err , ChallengeAttemptFailedMsg )
2023-06-01 19:57:41 +00:00
return ace . _verifyChallengeRetry ( sc , cv , authzPath , authz , challenge , err , id )
2023-04-21 17:08:27 +00:00
}
2023-06-07 21:32:58 +00:00
case ACMEALPNChallenge :
if authz . Identifier . Type != ACMEDNSIdentifier {
err = fmt . Errorf ( "unsupported identifier type for authorization %v/%v in challenge %v: %v" , cv . Account , cv . Authorization , id , authz . Identifier . Type )
return ace . _verifyChallengeCleanup ( sc , err , id )
}
if authz . Wildcard {
err = fmt . Errorf ( "unable to validate wildcard authorization %v/%v in challenge %v via tls-alpn-01 challenge" , cv . Account , cv . Authorization , id )
return ace . _verifyChallengeCleanup ( sc , err , id )
}
valid , err = ValidateTLSALPN01Challenge ( authz . Identifier . Value , cv . Token , cv . Thumbprint , config )
if err != nil {
err = fmt . Errorf ( "%w: error validating tls-alpn-01 challenge %v: %s" , ErrIncorrectResponse , id , err . Error ( ) )
return ace . _verifyChallengeRetry ( sc , cv , authzPath , authz , challenge , err , id )
}
2023-04-19 16:31:19 +00:00
default :
err = fmt . Errorf ( "unsupported ACME challenge type %v for challenge %v" , cv . ChallengeType , id )
return ace . _verifyChallengeCleanup ( sc , err , id )
}
if ! valid {
2023-06-01 19:57:41 +00:00
err = fmt . Errorf ( "%w: challenge failed with no additional information" , ErrIncorrectResponse )
return ace . _verifyChallengeRetry ( sc , cv , authzPath , authz , challenge , err , id )
2023-04-19 16:31:19 +00:00
}
// If we got here, the challenge verification was successful. Update
// the authorization appropriately.
expires := now . Add ( 15 * 24 * time . Hour )
challenge . Status = ACMEChallengeValid
challenge . Validated = now . Format ( time . RFC3339 )
2023-06-01 19:57:41 +00:00
challenge . Error = nil
2023-04-19 16:31:19 +00:00
authz . Status = ACMEAuthorizationValid
authz . Expires = expires . Format ( time . RFC3339 )
if err := saveAuthorizationAtPath ( sc , authzPath , authz ) ; err != nil {
err = fmt . Errorf ( "error saving updated (validated) authorization %v/%v for challenge %v: %w" , cv . Account , cv . Authorization , id , err )
2023-06-01 19:57:41 +00:00
return ace . _verifyChallengeRetry ( sc , cv , authzPath , authz , challenge , err , id )
2023-04-19 16:31:19 +00:00
}
return ace . _verifyChallengeCleanup ( sc , nil , id )
}
2023-06-01 19:57:41 +00:00
func ( ace * ACMEChallengeEngine ) _verifyChallengeRetry ( sc * storageContext , cv * ChallengeValidation , authzPath string , auth * ACMEAuthorization , challenge * ACMEChallenge , verificationErr error , id string ) ( bool , time . Time , error ) {
2023-04-19 16:31:19 +00:00
now := time . Now ( )
path := acmeValidationPrefix + id
2023-06-01 19:57:41 +00:00
if err := updateChallengeStatus ( sc , cv , authzPath , auth , challenge , verificationErr ) ; err != nil {
return true , now , err
}
2023-04-19 16:31:19 +00:00
if cv . RetryCount > MaxRetryAttempts {
2023-06-01 19:57:41 +00:00
err := fmt . Errorf ( "reached max error attempts for challenge %v: %w" , id , verificationErr )
2023-04-19 16:31:19 +00:00
return ace . _verifyChallengeCleanup ( sc , err , id )
}
if cv . FirstValidation . IsZero ( ) {
cv . FirstValidation = now
}
cv . RetryCount += 1
cv . LastRetry = now
cv . RetryAfter = now . Add ( time . Duration ( cv . RetryCount * 5 ) * time . Second )
json , jsonErr := logical . StorageEntryJSON ( path , cv )
if jsonErr != nil {
2023-06-01 19:57:41 +00:00
return true , now , fmt . Errorf ( "error persisting updated challenge validation queue entry (error prior to retry, if any: %v): %w" , verificationErr , jsonErr )
2023-04-19 16:31:19 +00:00
}
if putErr := sc . Storage . Put ( sc . Context , json ) ; putErr != nil {
2023-06-01 19:57:41 +00:00
return true , now , fmt . Errorf ( "error writing updated challenge validation entry (error prior to retry, if any: %v): %w" , verificationErr , putErr )
2023-04-19 16:31:19 +00:00
}
2023-06-01 19:57:41 +00:00
if verificationErr != nil {
verificationErr = fmt . Errorf ( "retrying validation: %w" , verificationErr )
}
return true , cv . RetryAfter , verificationErr
}
func updateChallengeStatus ( sc * storageContext , cv * ChallengeValidation , authzPath string , auth * ACMEAuthorization , challenge * ACMEChallenge , verificationErr error ) error {
if verificationErr != nil {
challengeError := TranslateErrorToErrorResponse ( verificationErr )
challenge . Error = challengeError . MarshalForStorage ( )
}
if cv . RetryCount > MaxRetryAttempts {
challenge . Status = ACMEChallengeInvalid
auth . Status = ACMEAuthorizationInvalid
2023-04-19 16:31:19 +00:00
}
2023-06-01 19:57:41 +00:00
if err := saveAuthorizationAtPath ( sc , authzPath , auth ) ; err != nil {
return fmt . Errorf ( "error persisting authorization/challenge update: %w" , err )
}
return nil
2023-04-19 16:31:19 +00:00
}
2023-04-21 17:08:27 +00:00
func ( ace * ACMEChallengeEngine ) _verifyChallengeCleanup ( sc * storageContext , err error , id string ) ( bool , time . Time , error ) {
now := time . Now ( )
2023-04-19 16:31:19 +00:00
// Remove our ChallengeValidation entry only.
if deleteErr := sc . Storage . Delete ( sc . Context , acmeValidationPrefix + id ) ; deleteErr != nil {
2023-04-21 17:08:27 +00:00
return true , now . Add ( - 1 * time . Second ) , fmt . Errorf ( "error deleting challenge %v (error prior to cleanup, if any: %v): %w" , id , err , deleteErr )
2023-04-19 16:31:19 +00:00
}
if err != nil {
err = fmt . Errorf ( "removing challenge validation attempt and not retrying %v; previous error: %w" , id , err )
}
2023-04-21 17:08:27 +00:00
return false , now , err
2023-04-19 16:31:19 +00:00
}