Add ACME DNS validator (#20293)

* Add DNS challenge validation to ACME

This allows us to validate against wildcard domain names, as the HTTP-01
challenge does not support wildcard validation.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Properly delay challenge retry attempts

Previously we'd essentially semi-busy wait for a challenge to become
retry-able, as the queue itself had no knowledge of the retry after
value of the validation attempt. Now, we plumb through this value into
the queue itself, to aide selection of validations to attempt.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

---------

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
This commit is contained in:
Alexander Scheel 2023-04-21 13:08:27 -04:00 committed by GitHub
parent c5d10e0b8b
commit dfb29f3b96
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 148 additions and 23 deletions

View File

@ -34,6 +34,11 @@ type ChallengeValidation struct {
RetryAfter time.Time `json:"retry_after,omitempty"`
}
type ChallengeQueueEntry struct {
Identifier string
RetryAfter time.Time
}
type ACMEChallengeEngine struct {
NumWorkers int
@ -72,7 +77,9 @@ func (ace *ACMEChallengeEngine) LoadFromStorage(b *backend, sc *storageContext)
// Add them to our queue of validations to work through later.
for _, item := range items {
ace.Validations.PushBack(item)
ace.Validations.PushBack(&ChallengeQueueEntry{
Identifier: item,
})
}
return nil
@ -125,22 +132,76 @@ func (ace *ACMEChallengeEngine) _run(b *backend) error {
finishedWorkersChannels = newFinishedWorkersChannels
// If we have space to take on another work item, do so.
if len(finishedWorkersChannels) < ace.NumWorkers {
firstIdentifier := ""
startedWork := false
now := time.Now()
for len(finishedWorkersChannels) < ace.NumWorkers {
var task *ChallengeQueueEntry
// Find our next work item. We do all of these operations
// while holding the queue lock, hence some repeated checks
// afterwards. Out of this, we get a candidate task, using
// element == nil as a sentinel for breaking our parent
// loop.
ace.ValidationLock.Lock()
element := ace.Validations.Front()
if element != nil {
ace.Validations.Remove(element)
task = element.Value.(*ChallengeQueueEntry)
if !task.RetryAfter.IsZero() && now.Before(task.RetryAfter) {
// We cannot work on this element yet; remove it to
// the back of the queue. This allows us to potentially
// select the next item in the next iteration.
ace.Validations.PushBack(task)
}
if firstIdentifier != "" && task.Identifier == firstIdentifier {
// We found and rejected this element before; exit the
// loop by "claiming" we didn't find any work.
element = nil
} else if firstIdentifier == "" {
firstIdentifier = task.Identifier
}
}
ace.ValidationLock.Unlock()
if element == nil {
// There was no more work to do to fill up the queue; exit
// this loop.
break
}
if now.Before(task.RetryAfter) {
// Here, while we found an element, we didn't want to
// completely exit the loop (perhaps it was our first time
// finding a work order), so retry without modifying
// firstIdentifier.
continue
}
task := element.Value.(string)
// Since this work item was valid, we won't expect to see it in
// the validation queue again until it is executed. Here, we
// want to avoid infinite looping above (if we removed the one
// valid item and the remainder are all not immediately
// actionable). At the worst, we'll spend a little more time
// looping through the queue until we hit a repeat.
firstIdentifier = ""
// Here, we got a piece of work that is ready to check; create a
// channel and a new go routine and run it. Note that this still
// could have a RetryAfter date we're not aware of (e.g., if the
// cluster restarted as we do not read the entries there).
channel := make(chan bool, 1)
go ace.VerifyChallenge(runnerSC, task, channel)
go ace.VerifyChallenge(runnerSC, task.Identifier, channel)
finishedWorkersChannels = append(finishedWorkersChannels, channel)
startedWork = true
}
// If we have no more work.
if len(finishedWorkersChannels) == ace.NumWorkers {
time.Sleep(50 * time.Millisecond)
// If we have no more capacity for work, we should pause a little to
// let the system catch up. Additionally, if we only had
// non-actionable work items, we should pause until some time has
// elapsed: not too much that we potentially starve any new incoming
// items from validation, but not too short that we cause a busy loop.
if len(finishedWorkersChannels) == ace.NumWorkers || !startedWork {
time.Sleep(100 * time.Millisecond)
}
// Lastly, if we have more work to do, re-trigger ourselves.
@ -206,7 +267,9 @@ func (ace *ACMEChallengeEngine) AcceptChallenge(sc *storageContext, account stri
ace.ValidationLock.Lock()
defer ace.ValidationLock.Unlock()
ace.Validations.PushBack(name)
ace.Validations.PushBack(&ChallengeQueueEntry{
Identifier: name,
})
select {
case ace.NewValidation <- name:
@ -220,7 +283,7 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
sc, _ /* cancel func */ := runnerSc.WithFreshTimeout(MaxChallengeTimeout)
runnerSc.Backend.Logger().Debug("Starting verification of challenge: %v", id)
if retry, err := ace._verifyChallenge(sc, id); err != nil {
if retry, retryAfter, err := ace._verifyChallenge(sc, id); err != nil {
// Because verification of this challenge failed, we need to retry
// it in the future. Log the error and re-add the item to the queue
// to try again later.
@ -229,7 +292,10 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
if retry {
ace.ValidationLock.Lock()
defer ace.ValidationLock.Unlock()
ace.Validations.PushBack(id)
ace.Validations.PushBack(&ChallengeQueueEntry{
Identifier: id,
RetryAfter: retryAfter,
})
// Let the validator know there's a pending challenge.
select {
@ -249,12 +315,12 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
finished <- false
}
func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string) (bool, error) {
func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string) (bool, time.Time, error) {
now := time.Now()
path := acmeValidationPrefix + id
challengeEntry, err := sc.Storage.Get(sc.Context, path)
if err != nil {
return true, fmt.Errorf("error loading challenge %v: %w", id, err)
return true, now, fmt.Errorf("error loading challenge %v: %w", id, err)
}
if challengeEntry == nil {
@ -267,18 +333,17 @@ func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string)
var cv *ChallengeValidation
if err := challengeEntry.DecodeJSON(&cv); err != nil {
return true, fmt.Errorf("error decoding challenge %v: %w", id, err)
return true, now, fmt.Errorf("error decoding challenge %v: %w", id, err)
}
if now.Before(cv.RetryAfter) {
time.Sleep(50 * time.Millisecond)
return true, fmt.Errorf("retrying challenge %v too soon", id)
return true, cv.RetryAfter, fmt.Errorf("retrying challenge %v too soon", id)
}
authzPath := getAuthorizationPath(cv.Account, cv.Authorization)
authz, err := loadAuthorizationAtPath(sc, authzPath)
if err != nil {
return true, fmt.Errorf("error loading authorization %v/%v for challenge %v: %w", cv.Account, cv.Authorization, id, err)
return true, now, fmt.Errorf("error loading authorization %v/%v for challenge %v: %w", cv.Account, cv.Authorization, id, err)
}
if authz.Status != ACMEAuthorizationPending {
@ -324,12 +389,24 @@ func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string)
err = fmt.Errorf("error validating http-01 challenge %v: %w", id, err)
return ace._verifyChallengeRetry(sc, cv, authz, err, id)
}
case ACMEDNSChallenge:
if authz.Identifier.Type != ACMEDNSIdentifier {
err = fmt.Errorf("unsupported identifier type for authorization %v/%v in challenge %v: %v", cv.Account, cv.Authorization, id, authz.Identifier.Type)
return ace._verifyChallengeCleanup(sc, err, id)
}
valid, err = ValidateDNS01Challenge(authz.Identifier.Value, cv.Token, cv.Thumbprint)
if err != nil {
err = fmt.Errorf("error validating dns-01 challenge %v: %w", id, err)
return ace._verifyChallengeRetry(sc, cv, authz, err, id)
}
default:
err = fmt.Errorf("unsupported ACME challenge type %v for challenge %v", cv.ChallengeType, id)
return ace._verifyChallengeCleanup(sc, err, id)
}
if !valid {
err = fmt.Errorf("challenge failed with no additional information")
return ace._verifyChallengeRetry(sc, cv, authz, err, id)
}
@ -349,7 +426,7 @@ func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string)
return ace._verifyChallengeCleanup(sc, nil, id)
}
func (ace *ACMEChallengeEngine) _verifyChallengeRetry(sc *storageContext, cv *ChallengeValidation, authz *ACMEAuthorization, err error, id string) (bool, error) {
func (ace *ACMEChallengeEngine) _verifyChallengeRetry(sc *storageContext, cv *ChallengeValidation, authz *ACMEAuthorization, err error, id string) (bool, time.Time, error) {
now := time.Now()
path := acmeValidationPrefix + id
@ -367,29 +444,31 @@ func (ace *ACMEChallengeEngine) _verifyChallengeRetry(sc *storageContext, cv *Ch
json, jsonErr := logical.StorageEntryJSON(path, cv)
if jsonErr != nil {
return true, fmt.Errorf("error persisting updated challenge validation queue entry (error prior to retry, if any: %v): %w", err, jsonErr)
return true, now, fmt.Errorf("error persisting updated challenge validation queue entry (error prior to retry, if any: %v): %w", err, jsonErr)
}
if putErr := sc.Storage.Put(sc.Context, json); putErr != nil {
return true, fmt.Errorf("error writing updated challenge validation entry (error prior to retry, if any: %v): %w", err, putErr)
return true, now, fmt.Errorf("error writing updated challenge validation entry (error prior to retry, if any: %v): %w", err, putErr)
}
if err != nil {
err = fmt.Errorf("retrying validation: %w", err)
}
return true, err
return true, cv.RetryAfter, err
}
func (ace *ACMEChallengeEngine) _verifyChallengeCleanup(sc *storageContext, err error, id string) (bool, error) {
func (ace *ACMEChallengeEngine) _verifyChallengeCleanup(sc *storageContext, err error, id string) (bool, time.Time, error) {
now := time.Now()
// Remove our ChallengeValidation entry only.
if deleteErr := sc.Storage.Delete(sc.Context, acmeValidationPrefix+id); deleteErr != nil {
return true, fmt.Errorf("error deleting challenge %v (error prior to cleanup, if any: %v): %w", id, err, deleteErr)
return true, now.Add(-1 * time.Second), fmt.Errorf("error deleting challenge %v (error prior to cleanup, if any: %v): %w", id, err, deleteErr)
}
if err != nil {
err = fmt.Errorf("removing challenge validation attempt and not retrying %v; previous error: %w", id, err)
}
return false, err
return false, now, err
}

View File

@ -1,6 +1,8 @@
package pki
import (
"crypto/sha256"
"encoding/base64"
"fmt"
"io"
"net"
@ -28,6 +30,23 @@ func ValidateKeyAuthorization(keyAuthz string, token string, thumbprint string)
return true, nil
}
// ValidateSHA256KeyAuthorization validates that the given keyAuthz from a
// challenge matches our expectation, returning (true, nil) if so, or
// (false, err) if not.
//
// This is for use with DNS challenges, which require
func ValidateSHA256KeyAuthorization(keyAuthz string, token string, thumbprint string) (bool, error) {
authzContents := token + "." + thumbprint
checksum := sha256.Sum256([]byte(authzContents))
expectedAuthz := base64.RawURLEncoding.EncodeToString(checksum[:])
if keyAuthz != expectedAuthz {
return false, fmt.Errorf("sha256 key authorization was invalid")
}
return true, nil
}
// Validates a given ACME http-01 challenge against the specified domain,
// per RFC 8555.
//
@ -109,3 +128,30 @@ func ValidateHTTP01Challenge(domain string, token string, thumbprint string) (bo
// the token because we're bounded by a reasonable amount of length.
return ValidateKeyAuthorization(keyAuthz, token, thumbprint)
}
func ValidateDNS01Challenge(domain string, token string, thumbprint string) (bool, error) {
// Here, domain is the value from the post-wildcard-processed identifier.
// Per RFC 8555, no difference in validation occurs if a wildcard entry
// is requested or if a non-wildcard entry is requested.
//
// XXX: In this case the DNS server is operator controlled and is assumed
// to be less malicious so the default resolver is used. In the future,
// we'll want to use net.Resolver for two reasons:
//
// 1. To control the actual resolver via ACME configuration,
// 2. To use a context to set stricter timeout limits.
name := "_acme-challenge." + domain
results, err := net.LookupTXT(name)
if err != nil {
return false, fmt.Errorf("dns-01: failed to lookup TXT records for domain (%v): %w", name, err)
}
for _, keyAuthz := range results {
ok, _ := ValidateSHA256KeyAuthorization(keyAuthz, token, thumbprint)
if ok {
return true, nil
}
}
return false, fmt.Errorf("dns-01: challenge failed against %v records", len(results))
}