903b5baaa4
When replication of a single key fails, the replication loop breaks early, so keys that fall later in the sort order never get replicated. This is a particular problem for clusters impacted by the bug behind #14981 that were later upgraded: the keys that were never replicated can now never be replicated, so we need to handle them safely.

Included in the replication fix:

* Refactor the replication loop so that each key is replicated in a function call that returns an error, to make the workflow clearer and reduce nesting. Log the error and continue.
* Improve the stability of the keyring replication tests. We no longer block leadership on initializing the keyring, so there's a race condition in the keyring tests where we can check for the existence of the root key before the keyring has been initialized. Change this to an "eventually" test (see the sketch below).

But these fixes aren't enough to close #14981, because affected clusters will still log an error once a second complaining about the missing key. We also need to fix keyring GC so the keys can be removed from the state store. Now we store the key ID used to sign a workload identity in the Allocation, and we index the Allocation table on that key ID so we can track whether any live Allocation was signed with a particular key.
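A minimal sketch of the "eventually" test pattern described above, assuming testify's require.Eventually; testServerWithKeyring is a hypothetical stand-in for the real test harness, while GetActiveRootKeyMeta is the same state store query the encrypter uses below:

	func TestKeyringIsEventuallyInitialized(t *testing.T) {
		srv := testServerWithKeyring(t) // hypothetical helper

		// leadership no longer blocks on keyring initialization, so
		// poll for the active root key rather than asserting
		// immediately after leader election
		require.Eventually(t, func() bool {
			keyMeta, err := srv.fsm.State().GetActiveRootKeyMeta(nil)
			return err == nil && keyMeta != nil
		}, 5*time.Second, 100*time.Millisecond)
	}
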
package nomad

import (
	"context"
	"crypto/aes"
	"crypto/cipher"
	"crypto/ed25519"
	"encoding/json"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	jwt "github.com/golang-jwt/jwt/v4"
	log "github.com/hashicorp/go-hclog"
	kms "github.com/hashicorp/go-kms-wrapping/v2"
	"github.com/hashicorp/go-kms-wrapping/v2/aead"
	"golang.org/x/time/rate"

	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/crypto"
	"github.com/hashicorp/nomad/nomad/structs"
)

const nomadKeystoreExtension = ".nks.json"

// Encrypter is the keyring for encrypting variables and signing workload
// identities.
type Encrypter struct {
	srv          *Server
	keystorePath string

	keyring map[string]*keyset
	lock    sync.RWMutex
}

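// keyset groups a root key with the cipher and signing key derived from
// it, so that the derivation work in addCipher happens only once per key.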
type keyset struct {
	rootKey    *structs.RootKey
	cipher     cipher.AEAD
	privateKey ed25519.PrivateKey
}

// NewEncrypter loads or creates a new local keystore and returns an
// encryption keyring with the keys it finds.
func NewEncrypter(srv *Server, keystorePath string) (*Encrypter, error) {

	encrypter := &Encrypter{
		srv:          srv,
		keystorePath: keystorePath,
		keyring:      make(map[string]*keyset),
	}

	err := encrypter.loadKeystore()
	if err != nil {
		return nil, err
	}
	return encrypter, nil
}

func (e *Encrypter) loadKeystore() error {

	if err := os.MkdirAll(e.keystorePath, 0o700); err != nil {
		return err
	}

	return filepath.Walk(e.keystorePath, func(path string, info fs.FileInfo, err error) error {
		if err != nil {
			return fmt.Errorf("could not read path %s from keystore: %v", path, err)
		}

		// skip over subdirectories and non-key files; they shouldn't
		// be here, but there's no reason to fail startup if the
		// administrator has left something behind
		if path != e.keystorePath && info.IsDir() {
			return filepath.SkipDir
		}
		if !strings.HasSuffix(path, nomadKeystoreExtension) {
			return nil
		}
		id := strings.TrimSuffix(filepath.Base(path), nomadKeystoreExtension)
		if !helper.IsUUID(id) {
			return nil
		}

		key, err := e.loadKeyFromStore(path)
		if err != nil {
			return fmt.Errorf("could not load key file %s from keystore: %v", path, err)
		}
		if key.Meta.KeyID != id {
			return fmt.Errorf("root key ID %s must match key file %s", key.Meta.KeyID, path)
		}

		err = e.AddKey(key)
		if err != nil {
			return fmt.Errorf("could not add key file %s to keystore: %v", path, err)
		}
		return nil
	})
}

// Encrypt encrypts the cleartext with the cipher for the current root
// key, and returns the ciphertext (with the nonce prepended) and the ID
// of the key used to encrypt it.
func (e *Encrypter) Encrypt(cleartext []byte) ([]byte, string, error) {

	keyset, err := e.activeKeySet()
	if err != nil {
		return nil, "", err
	}

	nonce, err := crypto.Bytes(keyset.cipher.NonceSize())
	if err != nil {
		return nil, "", fmt.Errorf("failed to generate key wrapper nonce: %v", err)
	}

	keyID := keyset.rootKey.Meta.KeyID
	additional := []byte(keyID) // include the keyID as additional authenticated data

	// we use the nonce as the dst buffer so that the ciphertext is
	// appended to that buffer and we always keep the nonce and
	// ciphertext together, and so that we're not tempted to reuse
	// the cleartext buffer which the caller still owns
	ciphertext := keyset.cipher.Seal(nonce, nonce, cleartext, additional)
	return ciphertext, keyID, nil
}

// Decrypt takes an encrypted buffer and the root key ID. It extracts
// the nonce, decrypts the content, and returns the cleartext data.
func (e *Encrypter) Decrypt(ciphertext []byte, keyID string) ([]byte, error) {
	e.lock.RLock()
	defer e.lock.RUnlock()

	keyset, err := e.keysetByIDLocked(keyID)
	if err != nil {
		return nil, err
	}

	nonceSize := keyset.cipher.NonceSize()
	nonce := ciphertext[:nonceSize] // nonce was stored alongside ciphertext
	additional := []byte(keyID)     // keyID was included as additional authenticated data

	return keyset.cipher.Open(nil, nonce, ciphertext[nonceSize:], additional)
}

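// A hypothetical round trip through the two methods above, for
// illustration only:
//
//	ciphertext, keyID, err := e.Encrypt([]byte("secret"))
//	// ... later, possibly on another server that has replicated keyID ...
//	cleartext, err := e.Decrypt(ciphertext, keyID)
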
// keyIDHeader is the JWT header for the Nomad Key ID used to sign the
// claim. This name matches the common industry practice for this
// header name.
const keyIDHeader = "kid"

// SignClaims signs the identity claim for the task and returns an encoded JWT
// (including both the claim and its signature), the key ID of the key used to
// sign it, and any error.
func (e *Encrypter) SignClaims(claim *structs.IdentityClaims) (string, string, error) {

	// If a key is rotated immediately following a leader election, plans that
	// are in-flight may get signed before the new leader has the key. Allow for
	// a short timeout-and-retry to avoid rejecting plans
	keyset, err := e.activeKeySet()
	if err != nil {
		ctx, cancel := context.WithTimeout(e.srv.shutdownCtx, 5*time.Second)
		defer cancel()
	RETRY:
		for {
			select {
			case <-ctx.Done():
				return "", "", err
			default:
				time.Sleep(50 * time.Millisecond)
				keyset, err = e.activeKeySet()
				if keyset != nil {
					// a bare break would only exit the select, not the
					// retry loop, so break out via the label
					break RETRY
				}
			}
		}
	}

	token := jwt.NewWithClaims(&jwt.SigningMethodEd25519{}, claim)
	token.Header[keyIDHeader] = keyset.rootKey.Meta.KeyID

	tokenString, err := token.SignedString(keyset.privateKey)
	if err != nil {
		return "", "", err
	}

	return tokenString, keyset.rootKey.Meta.KeyID, nil
}

// VerifyClaim accepts a previously-signed encoded claim and validates
// it before returning the claim
func (e *Encrypter) VerifyClaim(tokenString string) (*structs.IdentityClaims, error) {

	token, err := jwt.ParseWithClaims(tokenString, &structs.IdentityClaims{}, func(token *jwt.Token) (interface{}, error) {
		if _, ok := token.Method.(*jwt.SigningMethodEd25519); !ok {
			return nil, fmt.Errorf("unexpected signing method: %v", token.Method.Alg())
		}
		raw := token.Header[keyIDHeader]
		if raw == nil {
			return nil, fmt.Errorf("missing key ID header")
		}
		keyID, ok := raw.(string)
		if !ok {
			// an unchecked assertion would panic on a non-string "kid"
			// header in attacker-controlled input
			return nil, fmt.Errorf("invalid key ID header")
		}

		e.lock.RLock()
		defer e.lock.RUnlock()
		keyset, err := e.keysetByIDLocked(keyID)
		if err != nil {
			return nil, err
		}
		return keyset.privateKey.Public(), nil
	})

	if err != nil {
		return nil, fmt.Errorf("failed to verify token: %v", err)
	}

	claims, ok := token.Claims.(*structs.IdentityClaims)
	if !ok || !token.Valid {
		return nil, fmt.Errorf("failed to verify token: invalid token")
	}
	return claims, nil
}

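// A hypothetical sign/verify round trip, for illustration only:
//
//	tokenString, keyID, err := e.SignClaims(claims)
//	// ... the token is handed to a workload, which presents it back ...
//	verified, err := e.VerifyClaim(tokenString)
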
// AddKey stores the key in the keystore and creates a new cipher for it.
func (e *Encrypter) AddKey(rootKey *structs.RootKey) error {

	// note: we don't lock the keyring here but inside addCipher
	// instead, so that we're not holding the lock while performing
	// local disk writes
	if err := e.addCipher(rootKey); err != nil {
		return err
	}
	if err := e.saveKeyToStore(rootKey); err != nil {
		return err
	}
	return nil
}

// addCipher stores the key in the keyring and creates a new cipher for it.
func (e *Encrypter) addCipher(rootKey *structs.RootKey) error {

	if rootKey == nil || rootKey.Meta == nil {
		return fmt.Errorf("missing metadata")
	}
	var aead cipher.AEAD

	switch rootKey.Meta.Algorithm {
	case structs.EncryptionAlgorithmAES256GCM:
		block, err := aes.NewCipher(rootKey.Key)
		if err != nil {
			return fmt.Errorf("could not create cipher: %v", err)
		}
		aead, err = cipher.NewGCM(block)
		if err != nil {
			return fmt.Errorf("could not create cipher: %v", err)
		}
	default:
		return fmt.Errorf("invalid algorithm %s", rootKey.Meta.Algorithm)
	}

	privateKey := ed25519.NewKeyFromSeed(rootKey.Key)

	e.lock.Lock()
	defer e.lock.Unlock()
	e.keyring[rootKey.Meta.KeyID] = &keyset{
		rootKey:    rootKey,
		cipher:     aead,
		privateKey: privateKey,
	}
	return nil
}

// GetKey retrieves the key material by ID from the keyring
func (e *Encrypter) GetKey(keyID string) ([]byte, error) {
	e.lock.RLock()
	defer e.lock.RUnlock()

	keyset, err := e.keysetByIDLocked(keyID)
	if err != nil {
		return nil, err
	}
	return keyset.rootKey.Key, nil
}

// activeKeySet returns the keyset that belongs to the key marked as
// active in the state store (so that it's consistent with raft). It
// takes its own read lock on the keyring.
func (e *Encrypter) activeKeySet() (*keyset, error) {
	store := e.srv.fsm.State()
	keyMeta, err := store.GetActiveRootKeyMeta(nil)
	if err != nil {
		return nil, err
	}
	if keyMeta == nil {
		return nil, fmt.Errorf("keyring has not been initialized yet")
	}
	e.lock.RLock()
	defer e.lock.RUnlock()
	return e.keysetByIDLocked(keyMeta.KeyID)
}

// keysetByIDLocked returns the keyset for the specified keyID. The
// caller must read-lock the keyring
func (e *Encrypter) keysetByIDLocked(keyID string) (*keyset, error) {
	keyset, ok := e.keyring[keyID]
	if !ok {
		return nil, fmt.Errorf("no such key %q in keyring", keyID)
	}
	return keyset, nil
}

// RemoveKey removes a key by ID from the keyring
func (e *Encrypter) RemoveKey(keyID string) error {
	e.lock.Lock()
	defer e.lock.Unlock()
	delete(e.keyring, keyID)
	return nil
}

// saveKeyToStore serializes a root key to the on-disk keystore.
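// Each key is written to <keystorePath>/<key ID>.nks.json as a
// KeyEncryptionKeyWrapper holding the key metadata, the root key
// encrypted under a freshly generated KEK, and the cleartext KEK itself.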
func (e *Encrypter) saveKeyToStore(rootKey *structs.RootKey) error {

	kek, err := crypto.Bytes(32)
	if err != nil {
		return fmt.Errorf("failed to generate key wrapper key: %v", err)
	}
	wrapper, err := e.newKMSWrapper(rootKey.Meta.KeyID, kek)
	if err != nil {
		return fmt.Errorf("failed to create encryption wrapper: %v", err)
	}
	blob, err := wrapper.Encrypt(e.srv.shutdownCtx, rootKey.Key)
	if err != nil {
		return fmt.Errorf("failed to encrypt root key: %v", err)
	}

	kekWrapper := &structs.KeyEncryptionKeyWrapper{
		Meta:                       rootKey.Meta,
		EncryptedDataEncryptionKey: blob.Ciphertext,
		KeyEncryptionKey:           kek,
	}

	buf, err := json.Marshal(kekWrapper)
	if err != nil {
		return err
	}

	path := filepath.Join(e.keystorePath, rootKey.Meta.KeyID+nomadKeystoreExtension)
	err = os.WriteFile(path, buf, 0o600)
	if err != nil {
		return err
	}
	return nil
}

// loadKeyFromStore deserializes a root key from disk.
func (e *Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) {

	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	kekWrapper := &structs.KeyEncryptionKeyWrapper{}
	if err := json.Unmarshal(raw, kekWrapper); err != nil {
		return nil, err
	}

	meta := kekWrapper.Meta
	if err = meta.Validate(); err != nil {
		return nil, err
	}

	// the errors that bubble up from this library can be a bit opaque, so make
	// sure we wrap them with as much context as possible
	wrapper, err := e.newKMSWrapper(meta.KeyID, kekWrapper.KeyEncryptionKey)
	if err != nil {
		return nil, fmt.Errorf("unable to create key wrapper cipher: %v", err)
	}
	key, err := wrapper.Decrypt(e.srv.shutdownCtx, &kms.BlobInfo{
		Ciphertext: kekWrapper.EncryptedDataEncryptionKey,
	})
	if err != nil {
		return nil, fmt.Errorf("unable to decrypt wrapped root key: %v", err)
	}

	return &structs.RootKey{
		Meta: meta,
		Key:  key,
	}, nil
}

// newKMSWrapper returns a go-kms-wrapping interface the caller can use to
// encrypt the RootKey with a key encryption key (KEK). This is a bit of
// security theatre for local on-disk key material, but gives us a shim for
// external KMS providers in the future.
func (e *Encrypter) newKMSWrapper(keyID string, kek []byte) (kms.Wrapper, error) {
	wrapper := aead.NewWrapper()
	wrapper.SetConfig(context.Background(),
		aead.WithAeadType(kms.AeadTypeAesGcm),
		aead.WithHashType(kms.HashTypeSha256),
		kms.WithKeyId(keyID),
	)
	err := wrapper.SetAesGcmKeyBytes(kek)
	if err != nil {
		return nil, err
	}
	return wrapper, nil
}

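// KeyringReplicator pulls root keys the local server is missing from the
// leader (or, failing that, from any peer) into the local keystore, so
// that every server can decrypt variables and verify workload identities.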
type KeyringReplicator struct {
	srv       *Server
	encrypter *Encrypter
	logger    log.Logger
	stopFn    context.CancelFunc
}

func NewKeyringReplicator(srv *Server, e *Encrypter) *KeyringReplicator {
	ctx, cancel := context.WithCancel(context.Background())
	repl := &KeyringReplicator{
		srv:       srv,
		encrypter: e,
		logger:    srv.logger.Named("keyring.replicator"),
		stopFn:    cancel,
	}
	go repl.run(ctx)
	return repl
}

// stop is provided for testing
func (krr *KeyringReplicator) stop() {
	krr.stopFn()
}

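// keyringReplicationRate caps how many times per second the replicator
// scans the keyring for keys it has not yet fetched.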
const keyringReplicationRate = 10

func (krr *KeyringReplicator) run(ctx context.Context) {
	limiter := rate.NewLimiter(keyringReplicationRate, keyringReplicationRate)
	krr.logger.Debug("starting encryption key replication")
	defer krr.logger.Debug("exiting key replication")

	retryErrTimer, stop := helper.NewSafeTimer(time.Second * 1)
	defer stop()

START:
	store := krr.srv.fsm.State()

	for {
		select {
		case <-krr.srv.shutdownCtx.Done():
			return
		case <-ctx.Done():
			return
		default:
			// Rate limit how often we attempt replication
			err := limiter.Wait(ctx)
			if err != nil {
				goto ERR_WAIT // context canceled while waiting on the limiter
			}

			ws := store.NewWatchSet()
			iter, err := store.RootKeyMetas(ws)
			if err != nil {
				krr.logger.Error("failed to fetch keyring", "error", err)
				goto ERR_WAIT
			}

			for {
				raw := iter.Next()
				if raw == nil {
					break
				}

				keyMeta := raw.(*structs.RootKeyMeta)
				if key, err := krr.encrypter.GetKey(keyMeta.KeyID); err == nil && len(key) > 0 {
					// the key material is immutable so if we've already got it
					// we can move on to the next key
					continue
				}

				err := krr.replicateKey(ctx, keyMeta)
				if err != nil {
					// don't break the loop on an error, as we want to make sure
					// we've replicated any keys we can. the rate limiter will
					// prevent this case from sending excessive RPCs
					krr.logger.Error(err.Error(), "key", keyMeta.KeyID)
				}
			}
		}
	}

ERR_WAIT:
	retryErrTimer.Reset(1 * time.Second)

	select {
	case <-retryErrTimer.C:
		goto START
	case <-ctx.Done():
		return
	}
}

// replicateKey replicates a single key from peer servers that was present
// in the state store but missing from the keyring. Returns an error if no
// peer has the key or if it cannot be added to the local keyring.
func (krr *KeyringReplicator) replicateKey(ctx context.Context, keyMeta *structs.RootKeyMeta) error {
	keyID := keyMeta.KeyID
	krr.logger.Debug("replicating new key", "id", keyID)

	getReq := &structs.KeyringGetRootKeyRequest{
		KeyID: keyID,
		QueryOptions: structs.QueryOptions{
			Region:        krr.srv.config.Region,
			MinQueryIndex: keyMeta.ModifyIndex - 1,
		},
	}
	getResp := &structs.KeyringGetRootKeyResponse{}
	err := krr.srv.RPC("Keyring.Get", getReq, getResp)

	if err != nil || getResp.Key == nil {
		// Key replication needs to tolerate leadership flapping. If a key is
		// rotated during a leadership transition, it's possible that the new
		// leader has not yet replicated the key from the old leader before the
		// transition. Ask all the other servers if they have it.
		krr.logger.Warn("failed to fetch key from current leader, trying peers",
			"key", keyID, "error", err)
		getReq.AllowStale = true
		for _, peer := range krr.getAllPeers() {
			err = krr.srv.forwardServer(peer, "Keyring.Get", getReq, getResp)
			if err == nil && getResp.Key != nil {
				break
			}
		}
		if getResp.Key == nil {
			krr.logger.Error("failed to fetch key from any peer",
				"key", keyID, "error", err)
			return fmt.Errorf("failed to fetch key from any peer: %v", err)
		}
	}

	err = krr.encrypter.AddKey(getResp.Key)
	if err != nil {
		return fmt.Errorf("failed to add key to keyring: %v", err)
	}

	krr.logger.Debug("added key", "key", keyID)
	return nil
}

// TODO: move this method into Server?
func (krr *KeyringReplicator) getAllPeers() []*serverParts {
	krr.srv.peerLock.RLock()
	defer krr.srv.peerLock.RUnlock()
	peers := make([]*serverParts, 0, len(krr.srv.localPeers))
	for _, peer := range krr.srv.localPeers {
		peers = append(peers, peer.Copy())
	}
	return peers
}