open-nomad/nomad/keyring_endpoint.go
Tim Gross 0e1b554299
handle FSM.Apply errors in raftApply (#16287)
The signature of the `raftApply` function requires that the caller unwrap the
first returned value (the response from `FSM.Apply`) to see if it's an
error. This puts the burden on the caller to remember to check two different
places for errors, and we've done so inconsistently.

Update `raftApply` to do the unwrapping for us and return any `FSM.Apply` error
as the error value. Similar work was done in Consul in
https://github.com/hashicorp/consul/pull/9991. This eliminates some boilerplate
and surfaces a few minor bugs in the process:

* job deregistrations of already-GC'd jobs were still emitting evals
* reconcile job summaries does not return scheduler errors
* node updates did not report errors associated with inconsistent service
  discovery or CSI plugin states

Note that although _most_ of the `FSM.Apply` functions return only errors (which
makes it tempting to remove the first return value entirely), there are a few that
return `bool` for some reason and Variables relies on the response value for
proper CAS checking.
2023-03-02 13:51:09 -05:00

360 lines
9.5 KiB
Go

package nomad
import (
"fmt"
"time"
"github.com/armon/go-metrics"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-memdb"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/state"
"github.com/hashicorp/nomad/nomad/structs"
)
// Keyring is the RPC endpoint that serves root key management requests.
type Keyring struct {
	// srv is the server this endpoint belongs to. The handlers use it for
	// authentication, RPC forwarding, state snapshots, and Raft writes.
	srv *Server

	// ctx is the RPC context handed to authentication and to the TLS
	// certificate level check.
	ctx *RPCContext

	// logger is the endpoint's logger.
	logger hclog.Logger

	// encrypter is the local keystore that keys are added to, read from,
	// and removed from.
	encrypter *Encrypter
}
// NewKeyringEndpoint builds the Keyring RPC endpoint from the given server,
// RPC context, and encrypter. The endpoint logs through the server's logger
// under the name "keyring".
func NewKeyringEndpoint(srv *Server, ctx *RPCContext, enc *Encrypter) *Keyring {
	keyringLogger := srv.logger.Named("keyring")

	return &Keyring{
		srv:       srv,
		ctx:       ctx,
		logger:    keyringLogger,
		encrypter: enc,
	}
}
// Rotate creates a new root key, marks it active, stores it in the local
// encrypter, and then writes its metadata through Raft. If the request names
// no algorithm, structs.EncryptionAlgorithmAES256GCM is used. When args.Full
// is set, a core-job evaluation for structs.CoreJobVariablesRekey is enqueued
// as well. On success the reply carries the new key's metadata and the index
// returned by the Raft write.
//
// Callers whose resolved ACL object is non-nil and not a management ACL are
// rejected with structs.ErrPermissionDenied.
func (k *Keyring) Rotate(args *structs.KeyringRotateRootKeyRequest, reply *structs.KeyringRotateRootKeyResponse) error {
	// The authentication result is only acted on after the forwarding
	// attempt and the rate metric below.
	authErr := k.srv.Authenticate(k.ctx, args)

	done, fwdErr := k.srv.forward("Keyring.Rotate", args, args, reply)
	if done {
		return fwdErr
	}

	k.srv.MeasureRPCRate("keyring", structs.RateMetricWrite, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "keyring", "rotate"}, start)

	aclObj, aclErr := k.srv.ResolveACL(args)
	if aclErr != nil {
		return aclErr
	}
	if aclObj != nil && !aclObj.IsManagement() {
		return structs.ErrPermissionDenied
	}

	if args.Algorithm == "" {
		args.Algorithm = structs.EncryptionAlgorithmAES256GCM
	}

	newKey, err := structs.NewRootKey(args.Algorithm)
	if err != nil {
		return err
	}
	newKey.Meta.SetActive()

	// The key has to be in the local keystore before its metadata is
	// written to raft; otherwise followers could try to Get a key that
	// hasn't yet been written to disk.
	if err = k.encrypter.AddKey(newKey); err != nil {
		return err
	}

	// Write the metadata via Raft so followers can retrieve this key.
	metaUpdate := structs.KeyringUpdateRootKeyMetaRequest{
		RootKeyMeta:  newKey.Meta,
		Rekey:        args.Full,
		WriteRequest: args.WriteRequest,
	}
	_, index, err := k.srv.raftApply(structs.RootKeyMetaUpsertRequestType, metaUpdate)
	if err != nil {
		return err
	}

	reply.Key = newKey.Meta
	reply.Index = index

	if !args.Full {
		return nil
	}

	// Like most core jobs, this eval is not committed to raft: it's not
	// going to be periodically recreated and the ACL is from this leader.
	rekeyEval := &structs.Evaluation{
		ID:          uuid.Generate(),
		Namespace:   "-",
		Priority:    structs.CoreJobPriority,
		Type:        structs.JobTypeCore,
		TriggeredBy: structs.EvalTriggerJobRegister,
		JobID:       structs.CoreJobVariablesRekey,
		Status:      structs.EvalStatusPending,
		ModifyIndex: index,
		LeaderACL:   k.srv.getLeaderAcl(),
	}
	k.srv.evalBroker.Enqueue(rekeyEval)

	return nil
}
// List returns the metadata of every root key, served as a blocking query.
//
// Callers that pass the server-level TLS certificate check skip the ACL
// check. Every other caller is rejected with structs.ErrPermissionDenied if
// its resolved ACL object is non-nil and not a management ACL.
func (k *Keyring) List(args *structs.KeyringListRootKeyMetaRequest, reply *structs.KeyringListRootKeyMetaResponse) error {
	// The authentication result is only acted on after the forwarding
	// attempt and the rate metric below.
	authErr := k.srv.Authenticate(k.ctx, args)

	done, fwdErr := k.srv.forward("Keyring.List", args, args, reply)
	if done {
		return fwdErr
	}

	k.srv.MeasureRPCRate("keyring", structs.RateMetricList, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "keyring", "list"}, start)

	// Both humans with management tokens and non-leader servers need to
	// be able to list keys, in order to support replication. The ACL is
	// only consulted when the server-level certificate check fails.
	if tlsErr := validateTLSCertificateLevel(k.srv, k.ctx, tlsCertificateLevelServer); tlsErr != nil {
		aclObj, aclErr := k.srv.ResolveACL(args)
		if aclErr != nil {
			return aclErr
		}
		if aclObj != nil && !aclObj.IsManagement() {
			return structs.ErrPermissionDenied
		}
	}

	// collectKeys gathers all key metadata from a fresh state snapshot
	// into the reply.
	collectKeys := func(ws memdb.WatchSet, _ *state.StateStore) error {
		snap, err := k.srv.fsm.State().Snapshot()
		if err != nil {
			return err
		}

		iter, err := snap.RootKeyMetas(ws)
		if err != nil {
			return err
		}

		// Start from an empty, non-nil slice so the reply holds an
		// empty list rather than nil when there are no keys.
		metas := make([]*structs.RootKeyMeta, 0)
		for raw := iter.Next(); raw != nil; raw = iter.Next() {
			metas = append(metas, raw.(*structs.RootKeyMeta))
		}
		reply.Keys = metas

		return k.srv.replySetIndex(state.TableRootKeyMeta, &reply.QueryMeta)
	}

	// Run as a blocking query.
	return k.srv.blockingRPC(&blockingOptions{
		queryOpts: &args.QueryOptions,
		queryMeta: &reply.QueryMeta,
		run:       collectKeys,
	})
}
// Update writes a key into the keyring, covering both the key material and
// its metadata. The request is validated, the key is added to the local
// encrypter, and then only the metadata is written through Raft. On success
// the reply carries the index returned by the Raft write.
//
// Callers whose resolved ACL object is non-nil and not a management ACL are
// rejected with structs.ErrPermissionDenied.
func (k *Keyring) Update(args *structs.KeyringUpdateRootKeyRequest, reply *structs.KeyringUpdateRootKeyResponse) error {
	// The authentication result is only acted on after the forwarding
	// attempt and the rate metric below.
	authErr := k.srv.Authenticate(k.ctx, args)

	done, fwdErr := k.srv.forward("Keyring.Update", args, args, reply)
	if done {
		return fwdErr
	}

	k.srv.MeasureRPCRate("keyring", structs.RateMetricWrite, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "keyring", "update"}, start)

	aclObj, aclErr := k.srv.ResolveACL(args)
	if aclErr != nil {
		return aclErr
	}
	if aclObj != nil && !aclObj.IsManagement() {
		return structs.ErrPermissionDenied
	}

	var err error
	if err = k.validateUpdate(args); err != nil {
		return err
	}

	// The key has to be in the local keystore before its metadata is
	// written to raft; otherwise followers could try to Get a key that
	// hasn't yet been written to disk.
	if err = k.encrypter.AddKey(args.RootKey); err != nil {
		return err
	}

	// Only the metadata travels through Raft, so unwrap the request into
	// a metadata-only update.
	metaOnly := &structs.KeyringUpdateRootKeyMetaRequest{
		RootKeyMeta:  args.RootKey.Meta,
		WriteRequest: args.WriteRequest,
	}
	_, index, err := k.srv.raftApply(structs.RootKeyMetaUpsertRequestType, metaOnly)
	if err != nil {
		return err
	}

	reply.Index = index
	return nil
}
// validateUpdate validates both the request and that any change to an
// existing key is valid. It returns an error when the request carries no
// root key, when the key's metadata fails its own validation, when the key
// material is empty, or when the request would change the algorithm of a key
// that already exists in state. State lookup errors are returned unchanged.
func (k *Keyring) validateUpdate(args *structs.KeyringUpdateRootKeyRequest) error {
	// Reject a request that omits the root key entirely. Without this
	// guard the field accesses below dereference a nil pointer and panic
	// instead of returning an error to the caller.
	// NOTE(review): a nil Meta is still left for Meta.Validate to reject;
	// confirm that Validate tolerates a nil receiver.
	if args.RootKey == nil {
		return fmt.Errorf("root key is required")
	}

	err := args.RootKey.Meta.Validate()
	if err != nil {
		return err
	}
	if len(args.RootKey.Key) == 0 {
		return fmt.Errorf("root key material is required")
	}

	// lookup any existing key and validate the update
	snap, err := k.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}
	ws := memdb.NewWatchSet()
	keyMeta, err := snap.RootKeyMetaByID(ws, args.RootKey.Meta.KeyID)
	if err != nil {
		return err
	}

	// A key that is not yet in state has nothing to conflict with; an
	// existing key must keep the algorithm it was created with.
	if keyMeta != nil && keyMeta.Algorithm != args.RootKey.Meta.Algorithm {
		return fmt.Errorf("root key algorithm cannot be changed after a key is created")
	}

	return nil
}
// Get retrieves an existing key from the keyring, including both the key
// material and metadata. It is used only for replication, so the caller must
// pass the server-level TLS certificate check before anything else happens.
// The lookup is served as a blocking query; when no key with the requested
// ID exists, the reply carries no key.
func (k *Keyring) Get(args *structs.KeyringGetRootKeyRequest, reply *structs.KeyringGetRootKeyResponse) error {
	// The authentication result is only acted on after the certificate
	// check, the forwarding attempt, and the rate metric below.
	authErr := k.srv.Authenticate(k.ctx, args)

	// Only another server is allowed to make this request.
	if tlsErr := validateTLSCertificateLevel(k.srv, k.ctx, tlsCertificateLevelServer); tlsErr != nil {
		return tlsErr
	}

	done, fwdErr := k.srv.forward("Keyring.Get", args, args, reply)
	if done {
		return fwdErr
	}

	k.srv.MeasureRPCRate("keyring", structs.RateMetricRead, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "keyring", "get"}, start)

	if args.KeyID == "" {
		return fmt.Errorf("root key ID is required")
	}

	// fetchKey looks up the metadata in a fresh state snapshot and pairs
	// it with the key material held by the local encrypter.
	fetchKey := func(ws memdb.WatchSet, s *state.StateStore) error {
		snap, err := k.srv.fsm.State().Snapshot()
		if err != nil {
			return err
		}

		meta, err := snap.RootKeyMetaByID(ws, args.KeyID)
		if err != nil {
			return err
		}
		if meta == nil {
			// Unknown key: only the index is set on the reply.
			return k.srv.replySetIndex(state.TableRootKeyMeta, &reply.QueryMeta)
		}

		// The key material comes from the keyring, not from state.
		material, err := k.encrypter.GetKey(meta.KeyID)
		if err != nil {
			return err
		}
		reply.Key = &structs.RootKey{
			Meta: meta,
			Key:  material,
		}

		// Use the last index that affected the root key metadata table.
		tableIndex, err := s.Index(state.TableRootKeyMeta)
		if err != nil {
			return err
		}

		// The index must never be zero, otherwise a blocking query
		// cannot be used. Floor it at one, since realistically the
		// first write must have a higher index.
		reply.Index = tableIndex
		if tableIndex == 0 {
			reply.Index = 1
		}
		return nil
	}

	// Run as a blocking query.
	return k.srv.blockingRPC(&blockingOptions{
		queryOpts: &args.QueryOptions,
		queryMeta: &reply.QueryMeta,
		run:       fetchKey,
	})
}
// Delete removes a root key by ID. The key's metadata is deleted through
// Raft first, and the key is then removed from the local encrypter. On
// success the reply carries the index returned by the Raft write.
//
// Deleting a key that does not exist in state is a no-op that returns nil.
// Deleting the active key is refused with an error. Callers whose resolved
// ACL object is non-nil and not a management ACL are rejected with
// structs.ErrPermissionDenied.
func (k *Keyring) Delete(args *structs.KeyringDeleteRootKeyRequest, reply *structs.KeyringDeleteRootKeyResponse) error {
	// The authentication result is only acted on after the forwarding
	// attempt and the rate metric below.
	authErr := k.srv.Authenticate(k.ctx, args)

	done, fwdErr := k.srv.forward("Keyring.Delete", args, args, reply)
	if done {
		return fwdErr
	}

	k.srv.MeasureRPCRate("keyring", structs.RateMetricWrite, args)
	if authErr != nil {
		return structs.ErrPermissionDenied
	}

	start := time.Now()
	defer metrics.MeasureSince([]string{"nomad", "keyring", "delete"}, start)

	aclObj, aclErr := k.srv.ResolveACL(args)
	if aclErr != nil {
		return aclErr
	}
	if aclObj != nil && !aclObj.IsManagement() {
		return structs.ErrPermissionDenied
	}

	if args.KeyID == "" {
		return fmt.Errorf("root key ID is required")
	}

	// Look the key up in current state to decide whether the delete is
	// allowed.
	snap, err := k.srv.fsm.State().Snapshot()
	if err != nil {
		return err
	}
	existing, err := snap.RootKeyMetaByID(memdb.NewWatchSet(), args.KeyID)
	if err != nil {
		return err
	}

	switch {
	case existing == nil:
		// nothing to delete, so it is safe to bail out early
		return nil
	case existing.Active():
		return fmt.Errorf("active root key cannot be deleted - call rotate first")
	}

	// Delete the metadata via Raft.
	_, index, err := k.srv.raftApply(structs.RootKeyMetaDeleteRequestType, args)
	if err != nil {
		return err
	}

	// The key is removed from the keyring as well.
	k.encrypter.RemoveKey(args.KeyID)

	reply.Index = index
	return nil
}