open-nomad/nomad/encrypter.go
Tim Gross dd3a07302e
keyring: update handle to state inside replication loop (#15227)
* keyring: update handle to state inside replication loop

When keyring replication starts, we take a handle to the state store. But
whenever a snapshot is restored, this handle is invalidated and no longer points
to a state store that is receiving new keys. This leaks a bunch of memory too!

In addition to operator-initiated restores, when fresh servers are added to
existing clusters with large-enough state, the keyring replication can get
started quickly enough that it's running before the snapshot from the existing
cluster has been restored.

Fix this by updating the handle to the state store on each pass.
2022-11-17 08:40:12 -05:00
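
To illustrate the shape of the fix, here is a minimal sketch (editorial, not
taken from the diff; replicateAllKeys is a hypothetical placeholder for the
per-pass work) of a replication loop that re-reads the FSM's state store on
every pass instead of capturing the handle once before the loop:

func replicationLoopSketch(ctx context.Context, srv *Server) {
	for {
		select {
		case <-ctx.Done():
			return
		default:
			// taking the handle here, inside the loop, is the fix: a
			// handle captured above the for loop would keep pointing
			// at the old, orphaned state store after a snapshot restore
			store := srv.fsm.State()
			replicateAllKeys(ctx, store)
		}
	}
}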

package nomad
import (
"context"
"crypto/aes"
"crypto/cipher"
"crypto/ed25519"
"encoding/json"
"fmt"
"io/fs"
"os"
"path/filepath"
"strings"
"sync"
"time"
jwt "github.com/golang-jwt/jwt/v4"
log "github.com/hashicorp/go-hclog"
kms "github.com/hashicorp/go-kms-wrapping/v2"
"github.com/hashicorp/go-kms-wrapping/v2/aead"
"golang.org/x/time/rate"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/crypto"
"github.com/hashicorp/nomad/nomad/structs"
)
const nomadKeystoreExtension = ".nks.json"
// Encrypter is the keyring for encrypting variables and signing workload
// identities.
type Encrypter struct {
srv *Server
keystorePath string
keyring map[string]*keyset
lock sync.RWMutex
}
type keyset struct {
rootKey *structs.RootKey
cipher cipher.AEAD
privateKey ed25519.PrivateKey
}
// NewEncrypter loads or creates a new local keystore and returns an
// encryption keyring with the keys it finds.
func NewEncrypter(srv *Server, keystorePath string) (*Encrypter, error) {
encrypter := &Encrypter{
srv: srv,
keystorePath: keystorePath,
keyring: make(map[string]*keyset),
}
err := encrypter.loadKeystore()
if err != nil {
return nil, err
}
return encrypter, nil
}
func (e *Encrypter) loadKeystore() error {
if err := os.MkdirAll(e.keystorePath, 0o700); err != nil {
return err
}
return filepath.Walk(e.keystorePath, func(path string, info fs.FileInfo, err error) error {
if err != nil {
return fmt.Errorf("could not read path %s from keystore: %v", path, err)
}
// skip over subdirectories and non-key files; they shouldn't
// be here, but there's no reason to fail startup if the
// administrator has left something there
if path != e.keystorePath && info.IsDir() {
return filepath.SkipDir
}
if !strings.HasSuffix(path, nomadKeystoreExtension) {
return nil
}
id := strings.TrimSuffix(filepath.Base(path), nomadKeystoreExtension)
if !helper.IsUUID(id) {
return nil
}
key, err := e.loadKeyFromStore(path)
if err != nil {
return fmt.Errorf("could not load key file %s from keystore: %v", path, err)
}
if key.Meta.KeyID != id {
return fmt.Errorf("root key ID %s must match key file %s", key.Meta.KeyID, path)
}
err = e.AddKey(key)
if err != nil {
return fmt.Errorf("could not add key file %s to keystore: %v", path, err)
}
return nil
})
}
// Encrypt encrypts the clear data with the cipher for the current
// root key, and returns the ciphertext (with the nonce prepended) and
// the key ID used to encrypt it.
func (e *Encrypter) Encrypt(cleartext []byte) ([]byte, string, error) {
keyset, err := e.activeKeySet()
if err != nil {
return nil, "", err
}
nonce, err := crypto.Bytes(keyset.cipher.NonceSize())
if err != nil {
return nil, "", fmt.Errorf("failed to generate key wrapper nonce: %v", err)
}
keyID := keyset.rootKey.Meta.KeyID
additional := []byte(keyID) // bind the keyID to the ciphertext as additional authenticated data
// we use the nonce as the dst buffer so that the ciphertext is
// appended to that buffer and we always keep the nonce and
// ciphertext together, and so that we're not tempted to reuse
// the cleartext buffer which the caller still owns
ciphertext := keyset.cipher.Seal(nonce, nonce, cleartext, additional)
return ciphertext, keyID, nil
}
// Decrypt takes an encrypted buffer and the root key ID. It extracts
// the nonce, decrypts the content, and returns the cleartext data.
func (e *Encrypter) Decrypt(ciphertext []byte, keyID string) ([]byte, error) {
e.lock.RLock()
defer e.lock.RUnlock()
keyset, err := e.keysetByIDLocked(keyID)
if err != nil {
return nil, err
}
nonceSize := keyset.cipher.NonceSize()
nonce := ciphertext[:nonceSize] // nonce was stored alongside ciphertext
additional := []byte(keyID) // keyID was bound as additional authenticated data at encryption time
return keyset.cipher.Open(nil, nonce, ciphertext[nonceSize:], additional)
}
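// A usage sketch (illustrative only, not part of the original file): given an
// initialized Encrypter e, callers store the returned key ID alongside the
// ciphertext and pass both back to Decrypt:
//
//	ciphertext, keyID, err := e.Encrypt([]byte("my secret"))
//	if err != nil { /* handle error */ }
//	cleartext, err := e.Decrypt(ciphertext, keyID)
//	if err != nil { /* handle error */ }
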
// keyIDHeader is the JWT header for the Nomad Key ID used to sign the
// claim. The name matches the standard JWT "kid" header used across the
// industry.
const keyIDHeader = "kid"
// SignClaims signs the identity claim for the task and returns an encoded JWT
// (including both the claim and its signature), the key ID of the key used to
// sign it, and any error.
func (e *Encrypter) SignClaims(claim *structs.IdentityClaims) (string, string, error) {
// If a key is rotated immediately following a leader election, plans that
// are in-flight may get signed before the new leader has the key. Allow for
// a short timeout-and-retry to avoid rejecting plans
keyset, err := e.activeKeySet()
if err != nil {
ctx, cancel := context.WithTimeout(e.srv.shutdownCtx, 5*time.Second)
defer cancel()
retry:
for {
select {
case <-ctx.Done():
return "", "", err
default:
time.Sleep(50 * time.Millisecond)
keyset, err = e.activeKeySet()
if keyset != nil {
// a plain break would only exit the select, not this retry loop
break retry
}
}
}
}
token := jwt.NewWithClaims(&jwt.SigningMethodEd25519{}, claim)
token.Header[keyIDHeader] = keyset.rootKey.Meta.KeyID
tokenString, err := token.SignedString(keyset.privateKey)
if err != nil {
return "", "", err
}
return tokenString, keyset.rootKey.Meta.KeyID, nil
}
// VerifyClaim accepts a previously-signed encoded claim and validates
// it before returning the claim
func (e *Encrypter) VerifyClaim(tokenString string) (*structs.IdentityClaims, error) {
token, err := jwt.ParseWithClaims(tokenString, &structs.IdentityClaims{}, func(token *jwt.Token) (interface{}, error) {
if _, ok := token.Method.(*jwt.SigningMethodEd25519); !ok {
return nil, fmt.Errorf("unexpected signing method: %v", token.Method.Alg())
}
raw, ok := token.Header[keyIDHeader]
if !ok {
return nil, fmt.Errorf("missing key ID header")
}
keyID, ok := raw.(string)
if !ok {
return nil, fmt.Errorf("invalid key ID header")
}
e.lock.RLock()
defer e.lock.RUnlock()
keyset, err := e.keysetByIDLocked(keyID)
if err != nil {
return nil, err
}
return keyset.privateKey.Public(), nil
})
if err != nil {
return nil, fmt.Errorf("failed to verify token: %v", err)
}
claims, ok := token.Claims.(*structs.IdentityClaims)
if !ok || !token.Valid {
return nil, fmt.Errorf("failed to verify token: invalid token")
}
return claims, nil
}
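// A usage sketch (illustrative only, not part of the original file): signing
// and verification round-trip through the "kid" header, which carries the key
// ID so VerifyClaim can look up the matching keyset:
//
//	tokenString, keyID, err := e.SignClaims(claims) // claims: *structs.IdentityClaims
//	if err != nil { /* handle error */ }
//	verified, err := e.VerifyClaim(tokenString)
//	if err != nil { /* handle error */ }
//	// verified is the *structs.IdentityClaims decoded from the JWT
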
// AddKey stores the key in the keystore and creates a new cipher for it.
func (e *Encrypter) AddKey(rootKey *structs.RootKey) error {
// note: we don't lock the keyring here but inside addCipher
// instead, so that we're not holding the lock while performing
// local disk writes
if err := e.addCipher(rootKey); err != nil {
return err
}
if err := e.saveKeyToStore(rootKey); err != nil {
return err
}
return nil
}
// addCipher stores the key in the keyring and creates a new cipher for it.
func (e *Encrypter) addCipher(rootKey *structs.RootKey) error {
if rootKey == nil || rootKey.Meta == nil {
return fmt.Errorf("missing metadata")
}
var aead cipher.AEAD
switch rootKey.Meta.Algorithm {
case structs.EncryptionAlgorithmAES256GCM:
block, err := aes.NewCipher(rootKey.Key)
if err != nil {
return fmt.Errorf("could not create cipher: %v", err)
}
aead, err = cipher.NewGCM(block)
if err != nil {
return fmt.Errorf("could not create cipher: %v", err)
}
default:
return fmt.Errorf("invalid algorithm %s", rootKey.Meta.Algorithm)
}
privateKey := ed25519.NewKeyFromSeed(rootKey.Key)
e.lock.Lock()
defer e.lock.Unlock()
e.keyring[rootKey.Meta.KeyID] = &keyset{
rootKey: rootKey,
cipher: aead,
privateKey: privateKey,
}
return nil
}
// GetKey retrieves the key material by ID from the keyring
func (e *Encrypter) GetKey(keyID string) ([]byte, error) {
e.lock.RLock()
defer e.lock.RUnlock()
keyset, err := e.keysetByIDLocked(keyID)
if err != nil {
return nil, err
}
return keyset.rootKey.Key, nil
}
// activeKeySet returns the keyset that belongs to the key marked as
// active in the state store (so that it's consistent with raft). It
// takes its own read lock on the keyring.
func (e *Encrypter) activeKeySet() (*keyset, error) {
store := e.srv.fsm.State()
keyMeta, err := store.GetActiveRootKeyMeta(nil)
if err != nil {
return nil, err
}
if keyMeta == nil {
return nil, fmt.Errorf("keyring has not been initialized yet")
}
e.lock.RLock()
defer e.lock.RUnlock()
return e.keysetByIDLocked(keyMeta.KeyID)
}
// keysetByIDLocked returns the keyset for the specified keyID. The
// caller must read-lock the keyring
func (e *Encrypter) keysetByIDLocked(keyID string) (*keyset, error) {
keyset, ok := e.keyring[keyID]
if !ok {
return nil, fmt.Errorf("no such key %q in keyring", keyID)
}
return keyset, nil
}
// RemoveKey removes a key by ID from the keyring
func (e *Encrypter) RemoveKey(keyID string) error {
e.lock.Lock()
defer e.lock.Unlock()
delete(e.keyring, keyID)
return nil
}
// saveKeyToStore serializes a root key to the on-disk keystore.
func (e *Encrypter) saveKeyToStore(rootKey *structs.RootKey) error {
kek, err := crypto.Bytes(32)
if err != nil {
return fmt.Errorf("failed to generate key wrapper key: %v", err)
}
wrapper, err := e.newKMSWrapper(rootKey.Meta.KeyID, kek)
if err != nil {
return fmt.Errorf("failed to create encryption wrapper: %v", err)
}
blob, err := wrapper.Encrypt(e.srv.shutdownCtx, rootKey.Key)
if err != nil {
return fmt.Errorf("failed to encrypt root key: %v", err)
}
kekWrapper := &structs.KeyEncryptionKeyWrapper{
Meta: rootKey.Meta,
EncryptedDataEncryptionKey: blob.Ciphertext,
KeyEncryptionKey: kek,
}
buf, err := json.Marshal(kekWrapper)
if err != nil {
return err
}
path := filepath.Join(e.keystorePath, rootKey.Meta.KeyID+nomadKeystoreExtension)
err = os.WriteFile(path, buf, 0o600)
if err != nil {
return err
}
return nil
}
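// On-disk format note (an editorial summary, not part of the original file):
// each <keyID>.nks.json file holds the key metadata, the root key encrypted
// under a freshly generated KEK, and that KEK in cleartext alongside it; see
// the newKMSWrapper comment below on why this is only a shim for future
// external KMS support.
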
// loadKeyFromStore deserializes a root key from disk.
func (e *Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) {
raw, err := os.ReadFile(path)
if err != nil {
return nil, err
}
kekWrapper := &structs.KeyEncryptionKeyWrapper{}
if err := json.Unmarshal(raw, kekWrapper); err != nil {
return nil, err
}
meta := kekWrapper.Meta
if err = meta.Validate(); err != nil {
return nil, err
}
// the errors that bubble up from this library can be a bit opaque, so make
// sure we wrap them with as much context as possible
wrapper, err := e.newKMSWrapper(meta.KeyID, kekWrapper.KeyEncryptionKey)
if err != nil {
return nil, fmt.Errorf("unable to create key wrapper cipher: %v", err)
}
key, err := wrapper.Decrypt(e.srv.shutdownCtx, &kms.BlobInfo{
Ciphertext: kekWrapper.EncryptedDataEncryptionKey,
})
if err != nil {
return nil, fmt.Errorf("unable to decrypt wrapped root key: %v", err)
}
return &structs.RootKey{
Meta: meta,
Key: key,
}, nil
}
// newKMSWrapper returns a go-kms-wrapping interface the caller can use to
// encrypt the RootKey with a key encryption key (KEK). This is a bit of
// security theatre for local on-disk key material, but gives us a shim for
// external KMS providers in the future.
func (e *Encrypter) newKMSWrapper(keyID string, kek []byte) (kms.Wrapper, error) {
wrapper := aead.NewWrapper()
wrapper.SetConfig(context.Background(),
aead.WithAeadType(kms.AeadTypeAesGcm),
aead.WithHashType(kms.HashTypeSha256),
kms.WithKeyId(keyID),
)
err := wrapper.SetAesGcmKeyBytes(kek)
if err != nil {
return nil, err
}
return wrapper, nil
}
type KeyringReplicator struct {
srv *Server
encrypter *Encrypter
logger log.Logger
stopFn context.CancelFunc
}
func NewKeyringReplicator(srv *Server, e *Encrypter) *KeyringReplicator {
ctx, cancel := context.WithCancel(context.Background())
repl := &KeyringReplicator{
srv: srv,
encrypter: e,
logger: srv.logger.Named("keyring.replicator"),
stopFn: cancel,
}
go repl.run(ctx)
return repl
}
// stop is provided for testing
func (krr *KeyringReplicator) stop() {
krr.stopFn()
}
// keyringReplicationRate limits the replication loop to this many passes
// per second (with an equal burst) via the rate limiter below.
const keyringReplicationRate = 5
func (krr *KeyringReplicator) run(ctx context.Context) {
krr.logger.Debug("starting encryption key replication")
defer krr.logger.Debug("exiting key replication")
limiter := rate.NewLimiter(keyringReplicationRate, keyringReplicationRate)
for {
select {
case <-krr.srv.shutdownCtx.Done():
return
case <-ctx.Done():
return
default:
err := limiter.Wait(ctx)
if err != nil {
continue // Wait errors only if ctx is canceled; loop back so the select above can handle shutdown
}
// take a fresh handle to the state store on each pass; a snapshot
// restore swaps out the state store, which would orphan a handle
// taken before the loop started
store := krr.srv.fsm.State()
iter, err := store.RootKeyMetas(nil)
if err != nil {
krr.logger.Error("failed to fetch keyring", "error", err)
continue
}
for {
raw := iter.Next()
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
if key, err := krr.encrypter.GetKey(keyMeta.KeyID); err == nil && len(key) > 0 {
// the key material is immutable so if we've already got it
// we can move on to the next key
continue
}
err := krr.replicateKey(ctx, keyMeta)
if err != nil {
// don't break the loop on an error, as we want to make sure
// we've replicated any keys we can. the rate limiter will
// prevent this case from sending excessive RPCs
krr.logger.Error(err.Error(), "key", keyMeta.KeyID)
}
}
}
}
}
// replicateKey fetches a single key from peer servers when it is present in
// the state store but missing from the local keyring. Returns an error only
// if no peers have this key.
func (krr *KeyringReplicator) replicateKey(ctx context.Context, keyMeta *structs.RootKeyMeta) error {
keyID := keyMeta.KeyID
krr.logger.Debug("replicating new key", "id", keyID)
getReq := &structs.KeyringGetRootKeyRequest{
KeyID: keyID,
QueryOptions: structs.QueryOptions{
Region: krr.srv.config.Region,
MinQueryIndex: keyMeta.ModifyIndex - 1,
},
}
getResp := &structs.KeyringGetRootKeyResponse{}
err := krr.srv.RPC("Keyring.Get", getReq, getResp)
if err != nil || getResp.Key == nil {
// Key replication needs to tolerate leadership flapping. If a key is
// rotated during a leadership transition, it's possible that the new
// leader has not yet replicated the key from the old leader before the
// transition. Ask all the other servers if they have it.
krr.logger.Warn("failed to fetch key from current leader, trying peers",
"key", keyID, "error", err)
getReq.AllowStale = true
for _, peer := range krr.getAllPeers() {
err = krr.srv.forwardServer(peer, "Keyring.Get", getReq, getResp)
if err == nil && getResp.Key != nil {
break
}
}
if getResp.Key == nil {
krr.logger.Error("failed to fetch key from any peer",
"key", keyID, "error", err)
return fmt.Errorf("failed to fetch key from any peer: %v", err)
}
}
err = krr.encrypter.AddKey(getResp.Key)
if err != nil {
return fmt.Errorf("failed to add key to keyring: %v", err)
}
krr.logger.Debug("added key", "key", keyID)
return nil
}
// TODO: move this method into Server?
func (krr *KeyringReplicator) getAllPeers() []*serverParts {
krr.srv.peerLock.RLock()
defer krr.srv.peerLock.RUnlock()
peers := make([]*serverParts, 0, len(krr.srv.localPeers))
for _, peer := range krr.srv.localPeers {
peers = append(peers, peer.Copy())
}
return peers
}