open-nomad/nomad/keyring_endpoint.go
Tim Gross 3a811ac5e7
keyring: fixes for keyring replication on cluster join (#14987)
* keyring: don't unblock early if rate limit burst exceeded

The rate limiter returns an error and unblocks early if its burst limit is
exceeded (unless the burst limit is Inf). Ensure we're not unblocking early;
otherwise we'll only slow down the cases where we're already pausing to make
external RPC requests.
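
As a hedged illustration of the behavior described above (not the replicator
code itself), using golang.org/x/time/rate:

    limiter := rate.NewLimiter(rate.Limit(100), 1) // 100 tokens/sec, burst of 1

    // WaitN returns an error immediately -- without blocking -- when n exceeds
    // the burst (5 > 1 here) and the limit is not rate.Inf, so the caller must
    // not treat an error return as having already paused.
    if err := limiter.WaitN(context.Background(), 5); err != nil {
        // back off and retry instead of continuing as if we had been rate-limited
    }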

* keyring: set MinQueryIndex on stale queries

When keyring replication makes a stale query to non-leader peers to find a key
the leader doesn't have, we need to make sure the peer we're querying has had a
chance to catch up to the most current index for that key. Otherwise it's
possible for newly-added servers to query another newly-added server and get a
non-error nil response for that key ID.

Ensure that we're setting the correct reply index in the blocking query.

Note that the "not found" case does not return an error, just an empty key. So
as a belt-and-suspenders measure, update the handling of empty responses so
that we don't break out of the loop early if we hit a server that doesn't have
the key.
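
A minimal sketch of the kind of stale follower query described above. The field
names follow Nomad's structs.QueryOptions and the keyring RPC structs in this
file, but the surrounding values (region, keyMeta) are placeholders rather than
the actual replicator code:

    req := structs.KeyringGetRootKeyRequest{
        KeyID: keyMeta.KeyID,
        QueryOptions: structs.QueryOptions{
            Region:        region,                  // placeholder
            AllowStale:    true,                    // a non-leader peer may answer
            MinQueryIndex: keyMeta.ModifyIndex - 1, // wait for the peer to catch up to the key's raft index
        },
    }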

* test for adding new servers to keyring

* leader: initialize keyring after we have consistent reads

Wait until we're sure the FSM is current before we try to initialize the
keyring.

Also, if a key is rotated immediately following a leader election, plans that
are in-flight may get signed before the new leader has the key. Allow for a
short timeout-and-retry to avoid rejecting those plans.
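
A loose sketch of that timeout-and-retry (the Encrypter.GetKey method exists in
this package, but the helper below, its call site, and its timing values are
illustrative assumptions, not the actual leader or plan-applier code):

    // waitForKey polls the keyring until the key arrives or the timeout expires.
    func waitForKey(e *Encrypter, keyID string, timeout time.Duration) error {
        deadline := time.Now().Add(timeout)
        for {
            if _, err := e.GetKey(keyID); err == nil {
                return nil // key has replicated; safe to verify the plan
            }
            if time.Now().After(deadline) {
                return fmt.Errorf("no such key %q in keyring", keyID)
            }
            time.Sleep(100 * time.Millisecond) // short retry interval
        }
    }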
2022-10-21 12:33:16 -04:00

336 lines
8.8 KiB
Go

package nomad

import (
	"fmt"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// Keyring endpoint serves RPCs for root key management
type Keyring struct {
	srv       *Server
	logger    hclog.Logger
	encrypter *Encrypter
	ctx       *RPCContext // context for connection, to check TLS role
}
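
// Rotate generates a new root key, adds it to the local keystore and the
// state store, and makes it the active key. It requires a management token.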
func (k *Keyring) Rotate(args *structs.KeyringRotateRootKeyRequest, reply *structs.KeyringRotateRootKeyResponse) error {
	if done, err := k.srv.forward("Keyring.Rotate", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "keyring", "rotate"}, time.Now())

	if aclObj, err := k.srv.ResolveToken(args.AuthToken); err != nil {
		return err
	} else if aclObj != nil && !aclObj.IsManagement() {
		return structs.ErrPermissionDenied
	}

	if args.Algorithm == "" {
		args.Algorithm = structs.EncryptionAlgorithmAES256GCM
	}

	rootKey, err := structs.NewRootKey(args.Algorithm)
	if err != nil {
		return err
	}
	rootKey.Meta.SetActive()

	// make sure it's been added to the local keystore before we write
	// it to raft, so that followers don't try to Get a key that
	// hasn't yet been written to disk
	err = k.encrypter.AddKey(rootKey)
	if err != nil {
		return err
	}

	// Update metadata via Raft so followers can retrieve this key
	req := structs.KeyringUpdateRootKeyMetaRequest{
		RootKeyMeta:  rootKey.Meta,
		Rekey:        args.Full,
		WriteRequest: args.WriteRequest,
	}
	out, index, err := k.srv.raftApply(structs.RootKeyMetaUpsertRequestType, req)
	if err != nil {
		return err
	}
	if err, ok := out.(error); ok && err != nil {
		return err
	}
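
	// note that the reply carries only the key metadata, never the key
	// material itself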
	reply.Key = rootKey.Meta
	reply.Index = index

	if args.Full {
		// like most core jobs, we don't commit this to raft b/c it's not
		// going to be periodically recreated and the ACL is from this leader
		eval := &structs.Evaluation{
			ID:          uuid.Generate(),
			Namespace:   "-",
			Priority:    structs.CoreJobPriority,
			Type:        structs.JobTypeCore,
			TriggeredBy: structs.EvalTriggerJobRegister,
			JobID:       structs.CoreJobVariablesRekey,
			Status:      structs.EvalStatusPending,
			ModifyIndex: index,
			LeaderACL:   k.srv.getLeaderAcl(),
		}
		k.srv.evalBroker.Enqueue(eval)
	}

	return nil
}
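
// List returns the metadata for all root keys, without their key material. It
// serves both operators with management tokens and other servers, which need
// it for keyring replication, so a server TLS certificate is accepted in
// place of an ACL token.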
func (k *Keyring) List(args *structs.KeyringListRootKeyMetaRequest, reply *structs.KeyringListRootKeyMetaResponse) error {
	if done, err := k.srv.forward("Keyring.List", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "keyring", "list"}, time.Now())

	// we need to allow both humans with management tokens and
	// non-leader servers to list keys, in order to support
	// replication
	err := validateTLSCertificateLevel(k.srv, k.ctx, tlsCertificateLevelServer)
	if err != nil {
		if aclObj, err := k.srv.ResolveToken(args.AuthToken); err != nil {
			return err
		} else if aclObj != nil && !aclObj.IsManagement() {
			return structs.ErrPermissionDenied
		}
	}

	// Setup the blocking query
	opts := blockingOptions{
		queryOpts: &args.QueryOptions,
		queryMeta: &reply.QueryMeta,
		run: func(ws memdb.WatchSet, s *state.StateStore) error {
			// retrieve all the key metadata
			snap, err := k.srv.fsm.State().Snapshot()
			if err != nil {
				return err
			}
			iter, err := snap.RootKeyMetas(ws)
			if err != nil {
				return err
			}

			keys := []*structs.RootKeyMeta{}
			for {
				raw := iter.Next()
				if raw == nil {
					break
				}
				keyMeta := raw.(*structs.RootKeyMeta)
				keys = append(keys, keyMeta)
			}
			reply.Keys = keys
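			// set the reply index from the root key metadata table so that
			// blocking queries unblock when the keyring changes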
			return k.srv.replySetIndex(state.TableRootKeyMeta, &reply.QueryMeta)
		},
	}
	return k.srv.blockingRPC(&opts)
}

// Update updates an existing key in the keyring, including both the
// key material and metadata.
func (k *Keyring) Update(args *structs.KeyringUpdateRootKeyRequest, reply *structs.KeyringUpdateRootKeyResponse) error {
	if done, err := k.srv.forward("Keyring.Update", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "keyring", "update"}, time.Now())

	if aclObj, err := k.srv.ResolveToken(args.AuthToken); err != nil {
		return err
	} else if aclObj != nil && !aclObj.IsManagement() {
		return structs.ErrPermissionDenied
	}

	err := k.validateUpdate(args)
	if err != nil {
		return err
	}

	// make sure it's been added to the local keystore before we write
	// it to raft, so that followers don't try to Get a key that
	// hasn't yet been written to disk
	err = k.encrypter.AddKey(args.RootKey)
	if err != nil {
		return err
	}

	// unwrap the request to turn it into a meta update only
	metaReq := &structs.KeyringUpdateRootKeyMetaRequest{
		RootKeyMeta:  args.RootKey.Meta,
		WriteRequest: args.WriteRequest,
	}

	// update the metadata via Raft
	out, index, err := k.srv.raftApply(structs.RootKeyMetaUpsertRequestType, metaReq)
	if err != nil {
		return err
	}
	if err, ok := out.(error); ok && err != nil {
		return err
	}

	reply.Index = index
	return nil
}

// validateUpdate validates both the request and that any change to an
// existing key is valid
func (k *Keyring) validateUpdate(args *structs.KeyringUpdateRootKeyRequest) error {
	err := args.RootKey.Meta.Validate()
	if err != nil {
		return err
	}
	if len(args.RootKey.Key) == 0 {
		return fmt.Errorf("root key material is required")
	}

	// lookup any existing key and validate the update
	snap, err := k.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}
	ws := memdb.NewWatchSet()
	keyMeta, err := snap.RootKeyMetaByID(ws, args.RootKey.Meta.KeyID)
	if err != nil {
		return err
	}
	if keyMeta != nil && keyMeta.Algorithm != args.RootKey.Meta.Algorithm {
		return fmt.Errorf("root key algorithm cannot be changed after a key is created")
	}

	return nil
}

// Get retrieves an existing key from the keyring, including both the
// key material and metadata. It is used only for replication.
func (k *Keyring) Get(args *structs.KeyringGetRootKeyRequest, reply *structs.KeyringGetRootKeyResponse) error {
	// ensure that only another server can make this request
	err := validateTLSCertificateLevel(k.srv, k.ctx, tlsCertificateLevelServer)
	if err != nil {
		return err
	}
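	// note: replication requests set AllowStale, and stale reads are served
	// locally rather than forwarded, which is how this endpoint can answer
	// on behalf of a non-leader peer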
if done, err := k.srv.forward("Keyring.Get", args, args, reply); done {
return err
}
defer metrics.MeasureSince([]string{"nomad", "keyring", "get"}, time.Now())
if args.KeyID == "" {
return fmt.Errorf("root key ID is required")
}
// Setup the blocking query
opts := blockingOptions{
queryOpts: &args.QueryOptions,
queryMeta: &reply.QueryMeta,
run: func(ws memdb.WatchSet, s *state.StateStore) error {
// retrieve the key metadata
snap, err := k.srv.fsm.State().Snapshot()
if err != nil {
return err
}
keyMeta, err := snap.RootKeyMetaByID(ws, args.KeyID)
if err != nil {
return err
}
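			// a missing key is not an error: set the reply index and return
			// an empty response so a blocking or retrying caller can try
			// again once the key metadata has replicated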
			if keyMeta == nil {
				return k.srv.replySetIndex(state.TableRootKeyMeta, &reply.QueryMeta)
			}

			// retrieve the key material from the keyring
			key, err := k.encrypter.GetKey(keyMeta.KeyID)
			if err != nil {
				return err
			}
			rootKey := &structs.RootKey{
				Meta: keyMeta,
				Key:  key,
			}
			reply.Key = rootKey

			// Use the last index that affected the root key metadata table
			index, err := s.Index(state.TableRootKeyMeta)
			if err != nil {
				return err
			}

			// Ensure we never set the index to zero, otherwise a blocking query
			// cannot be used. We floor the index at one, since realistically
			// the first write must have a higher index.
			if index == 0 {
				index = 1
			}
			reply.Index = index
			return nil
		},
	}
	return k.srv.blockingRPC(&opts)
}
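
// Delete removes a root key from the keyring and the state store. The active
// key cannot be deleted; it must be rotated first. It requires a management
// token.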
func (k *Keyring) Delete(args *structs.KeyringDeleteRootKeyRequest, reply *structs.KeyringDeleteRootKeyResponse) error {
	if done, err := k.srv.forward("Keyring.Delete", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "keyring", "delete"}, time.Now())

	if aclObj, err := k.srv.ResolveToken(args.AuthToken); err != nil {
		return err
	} else if aclObj != nil && !aclObj.IsManagement() {
		return structs.ErrPermissionDenied
	}

	if args.KeyID == "" {
		return fmt.Errorf("root key ID is required")
	}

	// lookup any existing key and validate the delete
	snap, err := k.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}
	ws := memdb.NewWatchSet()
	keyMeta, err := snap.RootKeyMetaByID(ws, args.KeyID)
	if err != nil {
		return err
	}
	if keyMeta == nil {
		return nil // safe to bail out early
	}
	if keyMeta.Active() {
		return fmt.Errorf("active root key cannot be deleted - call rotate first")
	}

	// update via Raft
	out, index, err := k.srv.raftApply(structs.RootKeyMetaDeleteRequestType, args)
	if err != nil {
		return err
	}
	if err, ok := out.(error); ok && err != nil {
		return err
	}

	// remove the key from the keyring too
	k.encrypter.RemoveKey(args.KeyID)

	reply.Index = index
	return nil
}