903b5baaa4
When replication of a single key fails, the replication loop breaks early, so keys that fall later in the sort order never get replicated. This is a particular problem for clusters impacted by the bug behind #14981 that were later upgraded: the keys that were never replicated can now never be replicated, so we need to handle them safely.

Included in the replication fix:

* Refactor the replication loop so that each key is replicated in a function call that returns an error, to make the workflow clearer and reduce nesting. Log the error and continue.
* Improve the stability of the keyring replication tests. We no longer block leadership on initializing the keyring, so there's a race condition in the keyring tests where we can check for the existence of the root key before the keyring has been initialized. Change this to an "eventually" test (see the sketch below).

But these fixes aren't enough to close #14981, because affected clusters will still log an error once a second complaining about the missing key. We also need to fix keyring GC so the keys can be removed from the state store. Now we store the key ID used to sign a workload identity in the Allocation, and we index the Allocation table on that key ID so we can track whether any live Allocation was signed with a particular key.
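A minimal sketch of the "eventually" test pattern described above, assuming testify's require.Eventually; testServerWithKeyring is a hypothetical stand-in for the real test harness, while GetActiveRootKeyMeta is the same state store query the encrypter uses below:

	func TestKeyringIsEventuallyInitialized(t *testing.T) {
		srv := testServerWithKeyring(t) // hypothetical helper

		// leadership no longer blocks on keyring initialization, so
		// poll for the active root key rather than asserting
		// immediately after leader election
		require.Eventually(t, func() bool {
			keyMeta, err := srv.fsm.State().GetActiveRootKeyMeta(nil)
			return err == nil && keyMeta != nil
		}, 5*time.Second, 100*time.Millisecond)
	}
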
package nomad

import (
	"context"
	"crypto/aes"
	"crypto/cipher"
	"crypto/ed25519"
	"encoding/json"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	jwt "github.com/golang-jwt/jwt/v4"
	log "github.com/hashicorp/go-hclog"
	kms "github.com/hashicorp/go-kms-wrapping/v2"
	"github.com/hashicorp/go-kms-wrapping/v2/aead"
	"golang.org/x/time/rate"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/crypto"
	"github.com/hashicorp/nomad/nomad/structs"
)

const nomadKeystoreExtension = ".nks.json"

// Encrypter is the keyring for encrypting variables and signing workload
// identities.
type Encrypter struct {
	srv          *Server
	keystorePath string

	keyring map[string]*keyset
	lock    sync.RWMutex
}

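// keyset groups a root key with the cipher and signing key derived from
// it, so that the derivation work in addCipher happens only once per key.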
type keyset struct {
	rootKey    *structs.RootKey
	cipher     cipher.AEAD
	privateKey ed25519.PrivateKey
}

// NewEncrypter loads or creates a new local keystore and returns an
// encryption keyring with the keys it finds.
func NewEncrypter(srv *Server, keystorePath string) (*Encrypter, error) {

	encrypter := &Encrypter{
		srv:          srv,
		keystorePath: keystorePath,
		keyring:      make(map[string]*keyset),
	}

	err := encrypter.loadKeystore()
	if err != nil {
		return nil, err
	}
	return encrypter, nil
}

func (e *Encrypter) loadKeystore() error {

	if err := os.MkdirAll(e.keystorePath, 0o700); err != nil {
		return err
	}

	return filepath.Walk(e.keystorePath, func(path string, info fs.FileInfo, err error) error {
		if err != nil {
			return fmt.Errorf("could not read path %s from keystore: %v", path, err)
		}

		// skip over subdirectories and non-key files; they shouldn't
		// be here, but there's no reason to fail startup if the
		// administrator has left something behind
		if path != e.keystorePath && info.IsDir() {
			return filepath.SkipDir
		}
		if !strings.HasSuffix(path, nomadKeystoreExtension) {
			return nil
		}
		id := strings.TrimSuffix(filepath.Base(path), nomadKeystoreExtension)
		if !helper.IsUUID(id) {
			return nil
		}

		key, err := e.loadKeyFromStore(path)
		if err != nil {
			return fmt.Errorf("could not load key file %s from keystore: %v", path, err)
		}
		if key.Meta.KeyID != id {
			return fmt.Errorf("root key ID %s must match key file %s", key.Meta.KeyID, path)
		}

		err = e.AddKey(key)
		if err != nil {
			return fmt.Errorf("could not add key file %s to keystore: %v", path, err)
		}
		return nil
	})
}

// Encrypt encrypts the cleartext with the cipher for the current root
// key, and returns the ciphertext (with the nonce prepended) and the ID
// of the key used to encrypt it.
func (e *Encrypter) Encrypt(cleartext []byte) ([]byte, string, error) {

	keyset, err := e.activeKeySet()
	if err != nil {
		return nil, "", err
	}

	nonce, err := crypto.Bytes(keyset.cipher.NonceSize())
	if err != nil {
		return nil, "", fmt.Errorf("failed to generate key wrapper nonce: %v", err)
	}

	keyID := keyset.rootKey.Meta.KeyID
	additional := []byte(keyID) // include the keyID as additional authenticated data

	// we use the nonce as the dst buffer so that the ciphertext is
	// appended to that buffer and we always keep the nonce and
	// ciphertext together, and so that we're not tempted to reuse
	// the cleartext buffer which the caller still owns
	ciphertext := keyset.cipher.Seal(nonce, nonce, cleartext, additional)
	return ciphertext, keyID, nil
}

// Decrypt takes an encrypted buffer and the root key ID. It extracts
// the nonce, decrypts the content, and returns the cleartext data.
func (e *Encrypter) Decrypt(ciphertext []byte, keyID string) ([]byte, error) {
	e.lock.RLock()
	defer e.lock.RUnlock()

	keyset, err := e.keysetByIDLocked(keyID)
	if err != nil {
		return nil, err
	}

	nonceSize := keyset.cipher.NonceSize()
	nonce := ciphertext[:nonceSize] // nonce was stored alongside ciphertext
	additional := []byte(keyID)     // keyID was included as additional authenticated data

	return keyset.cipher.Open(nil, nonce, ciphertext[nonceSize:], additional)
}

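// A hypothetical round trip through the two methods above, for
// illustration only:
//
//	ciphertext, keyID, err := e.Encrypt([]byte("secret"))
//	// ... later, possibly on another server that has replicated keyID ...
//	cleartext, err := e.Decrypt(ciphertext, keyID)
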
// keyIDHeader is the JWT header for the Nomad Key ID used to sign the
// claim. This name matches the common industry practice for this
// header name.
const keyIDHeader = "kid"

// SignClaims signs the identity claim for the task and returns an encoded JWT
// (including both the claim and its signature), the key ID of the key used to
// sign it, and any error.
func (e *Encrypter) SignClaims(claim *structs.IdentityClaims) (string, string, error) {

	// If a key is rotated immediately following a leader election, plans that
	// are in-flight may get signed before the new leader has the key. Allow for
	// a short timeout-and-retry to avoid rejecting plans
	keyset, err := e.activeKeySet()
	if err != nil {
		ctx, cancel := context.WithTimeout(e.srv.shutdownCtx, 5*time.Second)
		defer cancel()
	RETRY:
		for {
			select {
			case <-ctx.Done():
				return "", "", err
			default:
				time.Sleep(50 * time.Millisecond)
				keyset, err = e.activeKeySet()
				if keyset != nil {
					// a bare break would only exit the select, not the
					// retry loop, so break out via the label
					break RETRY
				}
			}
		}
	}

	token := jwt.NewWithClaims(&jwt.SigningMethodEd25519{}, claim)
	token.Header[keyIDHeader] = keyset.rootKey.Meta.KeyID

	tokenString, err := token.SignedString(keyset.privateKey)
	if err != nil {
		return "", "", err
	}

	return tokenString, keyset.rootKey.Meta.KeyID, nil
}

// VerifyClaim accepts a previously-signed encoded claim and validates
// it before returning the claim
func (e *Encrypter) VerifyClaim(tokenString string) (*structs.IdentityClaims, error) {

	token, err := jwt.ParseWithClaims(tokenString, &structs.IdentityClaims{}, func(token *jwt.Token) (interface{}, error) {
		if _, ok := token.Method.(*jwt.SigningMethodEd25519); !ok {
			return nil, fmt.Errorf("unexpected signing method: %v", token.Method.Alg())
		}
		raw := token.Header[keyIDHeader]
		if raw == nil {
			return nil, fmt.Errorf("missing key ID header")
		}
		keyID, ok := raw.(string)
		if !ok {
			// an unchecked assertion would panic on a non-string "kid"
			// header in attacker-controlled input
			return nil, fmt.Errorf("invalid key ID header")
		}

		e.lock.RLock()
		defer e.lock.RUnlock()
		keyset, err := e.keysetByIDLocked(keyID)
		if err != nil {
			return nil, err
		}
		return keyset.privateKey.Public(), nil
	})

	if err != nil {
		return nil, fmt.Errorf("failed to verify token: %v", err)
	}

	claims, ok := token.Claims.(*structs.IdentityClaims)
	if !ok || !token.Valid {
		return nil, fmt.Errorf("failed to verify token: invalid token")
	}
	return claims, nil
}

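// A hypothetical sign/verify round trip, for illustration only:
//
//	tokenString, keyID, err := e.SignClaims(claims)
//	// ... the token is handed to a workload, which presents it back ...
//	verified, err := e.VerifyClaim(tokenString)
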
// AddKey stores the key in the keystore and creates a new cipher for it.
func (e *Encrypter) AddKey(rootKey *structs.RootKey) error {

	// note: we don't lock the keyring here but inside addCipher
	// instead, so that we're not holding the lock while performing
	// local disk writes
	if err := e.addCipher(rootKey); err != nil {
		return err
	}
	if err := e.saveKeyToStore(rootKey); err != nil {
		return err
	}
	return nil
}

// addCipher stores the key in the keyring and creates a new cipher for it.
func (e *Encrypter) addCipher(rootKey *structs.RootKey) error {

	if rootKey == nil || rootKey.Meta == nil {
		return fmt.Errorf("missing metadata")
	}
	var aead cipher.AEAD

	switch rootKey.Meta.Algorithm {
	case structs.EncryptionAlgorithmAES256GCM:
		block, err := aes.NewCipher(rootKey.Key)
		if err != nil {
			return fmt.Errorf("could not create cipher: %v", err)
		}
		aead, err = cipher.NewGCM(block)
		if err != nil {
			return fmt.Errorf("could not create cipher: %v", err)
		}
	default:
		return fmt.Errorf("invalid algorithm %s", rootKey.Meta.Algorithm)
	}

	privateKey := ed25519.NewKeyFromSeed(rootKey.Key)

	e.lock.Lock()
	defer e.lock.Unlock()
	e.keyring[rootKey.Meta.KeyID] = &keyset{
		rootKey:    rootKey,
		cipher:     aead,
		privateKey: privateKey,
	}
	return nil
}

// GetKey retrieves the key material by ID from the keyring
func (e *Encrypter) GetKey(keyID string) ([]byte, error) {
	e.lock.RLock()
	defer e.lock.RUnlock()

	keyset, err := e.keysetByIDLocked(keyID)
	if err != nil {
		return nil, err
	}
	return keyset.rootKey.Key, nil
}

// activeKeySet returns the keyset that belongs to the key marked as
// active in the state store (so that it's consistent with raft). It
// takes its own read lock on the keyring.
func (e *Encrypter) activeKeySet() (*keyset, error) {
	store := e.srv.fsm.State()
	keyMeta, err := store.GetActiveRootKeyMeta(nil)
	if err != nil {
		return nil, err
	}
	if keyMeta == nil {
		return nil, fmt.Errorf("keyring has not been initialized yet")
	}
	e.lock.RLock()
	defer e.lock.RUnlock()
	return e.keysetByIDLocked(keyMeta.KeyID)
}

// keysetByIDLocked returns the keyset for the specified keyID. The
// caller must read-lock the keyring
func (e *Encrypter) keysetByIDLocked(keyID string) (*keyset, error) {
	keyset, ok := e.keyring[keyID]
	if !ok {
		return nil, fmt.Errorf("no such key %q in keyring", keyID)
	}
	return keyset, nil
}

// RemoveKey removes a key by ID from the keyring
func (e *Encrypter) RemoveKey(keyID string) error {
	e.lock.Lock()
	defer e.lock.Unlock()
	delete(e.keyring, keyID)
	return nil
}

// saveKeyToStore serializes a root key to the on-disk keystore.
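// Each key is written to <keystorePath>/<key ID>.nks.json as a
// KeyEncryptionKeyWrapper holding the key metadata, the root key
// encrypted under a freshly generated KEK, and the cleartext KEK itself.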
func (e *Encrypter) saveKeyToStore(rootKey *structs.RootKey) error {

	kek, err := crypto.Bytes(32)
	if err != nil {
		return fmt.Errorf("failed to generate key wrapper key: %v", err)
	}
	wrapper, err := e.newKMSWrapper(rootKey.Meta.KeyID, kek)
	if err != nil {
		return fmt.Errorf("failed to create encryption wrapper: %v", err)
	}
	blob, err := wrapper.Encrypt(e.srv.shutdownCtx, rootKey.Key)
	if err != nil {
		return fmt.Errorf("failed to encrypt root key: %v", err)
	}

	kekWrapper := &structs.KeyEncryptionKeyWrapper{
		Meta:                       rootKey.Meta,
		EncryptedDataEncryptionKey: blob.Ciphertext,
		KeyEncryptionKey:           kek,
	}

	buf, err := json.Marshal(kekWrapper)
	if err != nil {
		return err
	}

	path := filepath.Join(e.keystorePath, rootKey.Meta.KeyID+nomadKeystoreExtension)
	err = os.WriteFile(path, buf, 0o600)
	if err != nil {
		return err
	}
	return nil
}

// loadKeyFromStore deserializes a root key from disk.
func (e *Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) {

	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	kekWrapper := &structs.KeyEncryptionKeyWrapper{}
	if err := json.Unmarshal(raw, kekWrapper); err != nil {
		return nil, err
	}

	meta := kekWrapper.Meta
	if err = meta.Validate(); err != nil {
		return nil, err
	}

	// the errors that bubble up from this library can be a bit opaque, so make
	// sure we wrap them with as much context as possible
	wrapper, err := e.newKMSWrapper(meta.KeyID, kekWrapper.KeyEncryptionKey)
	if err != nil {
		return nil, fmt.Errorf("unable to create key wrapper cipher: %v", err)
	}
	key, err := wrapper.Decrypt(e.srv.shutdownCtx, &kms.BlobInfo{
		Ciphertext: kekWrapper.EncryptedDataEncryptionKey,
	})
	if err != nil {
		return nil, fmt.Errorf("unable to decrypt wrapped root key: %v", err)
	}

	return &structs.RootKey{
		Meta: meta,
		Key:  key,
	}, nil
}

// newKMSWrapper returns a go-kms-wrapping interface the caller can use to
// encrypt the RootKey with a key encryption key (KEK). This is a bit of
// security theatre for local on-disk key material, but gives us a shim for
// external KMS providers in the future.
func (e *Encrypter) newKMSWrapper(keyID string, kek []byte) (kms.Wrapper, error) {
	wrapper := aead.NewWrapper()
	wrapper.SetConfig(context.Background(),
		aead.WithAeadType(kms.AeadTypeAesGcm),
		aead.WithHashType(kms.HashTypeSha256),
		kms.WithKeyId(keyID),
	)
	err := wrapper.SetAesGcmKeyBytes(kek)
	if err != nil {
		return nil, err
	}
	return wrapper, nil
}

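// KeyringReplicator pulls root keys the local server is missing from the
// leader (or, failing that, from any peer) into the local keystore, so
// that every server can decrypt variables and verify workload identities.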
type KeyringReplicator struct {
	srv       *Server
	encrypter *Encrypter
	logger    log.Logger
	stopFn    context.CancelFunc
}

func NewKeyringReplicator(srv *Server, e *Encrypter) *KeyringReplicator {
	ctx, cancel := context.WithCancel(context.Background())
	repl := &KeyringReplicator{
		srv:       srv,
		encrypter: e,
		logger:    srv.logger.Named("keyring.replicator"),
		stopFn:    cancel,
	}
	go repl.run(ctx)
	return repl
}

// stop is provided for testing
func (krr *KeyringReplicator) stop() {
	krr.stopFn()
}

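// keyringReplicationRate caps how many times per second the replicator
// scans the keyring for keys it has not yet fetched.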
const keyringReplicationRate = 10

func (krr *KeyringReplicator) run(ctx context.Context) {
	limiter := rate.NewLimiter(keyringReplicationRate, keyringReplicationRate)
	krr.logger.Debug("starting encryption key replication")
	defer krr.logger.Debug("exiting key replication")

	retryErrTimer, stop := helper.NewSafeTimer(time.Second * 1)
	defer stop()

START:
	store := krr.srv.fsm.State()

	for {
		select {
		case <-krr.srv.shutdownCtx.Done():
			return
		case <-ctx.Done():
			return
		default:
			// Rate limit how often we attempt replication
			err := limiter.Wait(ctx)
			if err != nil {
				goto ERR_WAIT // context canceled while waiting on the limiter
			}

			ws := store.NewWatchSet()
			iter, err := store.RootKeyMetas(ws)
			if err != nil {
				krr.logger.Error("failed to fetch keyring", "error", err)
				goto ERR_WAIT
			}

			for {
				raw := iter.Next()
				if raw == nil {
					break
				}

				keyMeta := raw.(*structs.RootKeyMeta)
				if key, err := krr.encrypter.GetKey(keyMeta.KeyID); err == nil && len(key) > 0 {
					// the key material is immutable so if we've already got it
					// we can move on to the next key
					continue
				}

				err := krr.replicateKey(ctx, keyMeta)
				if err != nil {
					// don't break the loop on an error, as we want to make sure
					// we've replicated any keys we can. the rate limiter will
					// prevent this case from sending excessive RPCs
					krr.logger.Error(err.Error(), "key", keyMeta.KeyID)
				}
			}
		}
	}

ERR_WAIT:
	retryErrTimer.Reset(1 * time.Second)

	select {
	case <-retryErrTimer.C:
		goto START
	case <-ctx.Done():
		return
	}
}

// replicateKey replicates a single key from peer servers that was present
// in the state store but missing from the keyring. Returns an error if no
// peer has the key or if it cannot be added to the local keyring.
func (krr *KeyringReplicator) replicateKey(ctx context.Context, keyMeta *structs.RootKeyMeta) error {
	keyID := keyMeta.KeyID
	krr.logger.Debug("replicating new key", "id", keyID)

	getReq := &structs.KeyringGetRootKeyRequest{
		KeyID: keyID,
		QueryOptions: structs.QueryOptions{
			Region:        krr.srv.config.Region,
			MinQueryIndex: keyMeta.ModifyIndex - 1,
		},
	}
	getResp := &structs.KeyringGetRootKeyResponse{}
	err := krr.srv.RPC("Keyring.Get", getReq, getResp)

	if err != nil || getResp.Key == nil {
		// Key replication needs to tolerate leadership flapping. If a key is
		// rotated during a leadership transition, it's possible that the new
		// leader has not yet replicated the key from the old leader before the
		// transition. Ask all the other servers if they have it.
		krr.logger.Warn("failed to fetch key from current leader, trying peers",
			"key", keyID, "error", err)
		getReq.AllowStale = true
		for _, peer := range krr.getAllPeers() {
			err = krr.srv.forwardServer(peer, "Keyring.Get", getReq, getResp)
			if err == nil && getResp.Key != nil {
				break
			}
		}
		if getResp.Key == nil {
			krr.logger.Error("failed to fetch key from any peer",
				"key", keyID, "error", err)
			return fmt.Errorf("failed to fetch key from any peer: %v", err)
		}
	}

	err = krr.encrypter.AddKey(getResp.Key)
	if err != nil {
		return fmt.Errorf("failed to add key to keyring: %v", err)
	}

	krr.logger.Debug("added key", "key", keyID)
	return nil
}

// TODO: move this method into Server?
func (krr *KeyringReplicator) getAllPeers() []*serverParts {
	krr.srv.peerLock.RLock()
	defer krr.srv.peerLock.RUnlock()
	peers := make([]*serverParts, 0, len(krr.srv.localPeers))
	for _, peer := range krr.srv.localPeers {
		peers = append(peers, peer.Copy())
	}
	return peers
}