handle `FSM.Apply` errors in `raftApply` (#16287)

The signature of the `raftApply` function requires that the caller unwrap the
first returned value (the response from `FSM.Apply`) to see if it's an
error. This puts the burden on the caller to remember to check two different
places for errors, and we've done so inconsistently.

Update `raftApply` to do the unwrapping for us and return any `FSM.Apply` error
as the error value. Similar work was done in Consul in
https://github.com/hashicorp/consul/pull/9991. This eliminates some boilerplate
and surfaces a few minor bugs in the process:

* job deregistrations of already-GC'd jobs were still emitting evals
* reconcile job summaries did not return scheduler errors
* node updates did not report errors associated with inconsistent service
  discovery or CSI plugin states

Note that although _most_ of the `FSM.Apply` functions return only errors (which
makes it tempting to remove the first return value entirely), there are a few
that return `bool` for some reason, and Variables relies on the response value
for proper CAS checking.
This commit is contained in:
Tim Gross 2023-03-02 13:51:09 -05:00 committed by GitHub
parent f3b5952c3e
commit 0e1b554299
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 65 additions and 192 deletions

11
.changelog/16287.txt Normal file
View File

@ -0,0 +1,11 @@
```release-note:bug
server: Fixed a bug where deregistering a job that was already garbage collected would create a new evaluation
```
```release-note:bug
server: Fixed a bug where the `system reconcile summaries` command and API would not return any scheduler-related errors
```
```release-note:bug
server: Fixed a bug where node updates that produced errors from service discovery or CSI plugin updates were not logged
```

View File

@ -1554,7 +1554,7 @@ func TestJobs_Deregister(t *testing.T) {
must.NoError(t, err)
assertWriteMeta(t, wm)
// Attempting delete on non-existing job returns an error
// Attempting delete on non-existing job does not return an error
_, _, err = jobs.Deregister("nope", false, nil)
must.NoError(t, err)

View File

@ -13,6 +13,7 @@ import (
)
func queryNodeList(t *testing.T, nodes *Nodes) ([]*NodeListStub, *QueryMeta) {
t.Helper()
var (
nodeListStub []*NodeListStub
queryMeta *QueryMeta

View File

@ -1342,16 +1342,11 @@ func (a *ACL) UpsertRoles(
}
// Update via Raft.
out, index, err := a.srv.raftApply(structs.ACLRolesUpsertRequestType, args)
_, index, err := a.srv.raftApply(structs.ACLRolesUpsertRequestType, args)
if err != nil {
return err
}
// Check if the FSM response, which is an interface, contains an error.
if err, ok := out.(error); ok && err != nil {
return err
}
// Populate the response. We do a lookup against the state to pick up the
// proper create / modify times.
stateSnapshot, err = a.srv.State().Snapshot()
@ -1413,16 +1408,11 @@ func (a *ACL) DeleteRolesByID(
}
// Update via Raft.
out, index, err := a.srv.raftApply(structs.ACLRolesDeleteByIDRequestType, args)
_, index, err := a.srv.raftApply(structs.ACLRolesDeleteByIDRequestType, args)
if err != nil {
return err
}
// Check if the FSM response, which is an interface, contains an error.
if err, ok := out.(error); ok && err != nil {
return err
}
// Update the index. There is no need to floor this as we are writing to
// state and therefore will get a non-zero index response.
reply.Index = index
@ -1899,16 +1889,11 @@ func (a *ACL) UpsertAuthMethods(
}
// Update via Raft
out, index, err := a.srv.raftApply(structs.ACLAuthMethodsUpsertRequestType, args)
_, index, err := a.srv.raftApply(structs.ACLAuthMethodsUpsertRequestType, args)
if err != nil {
return err
}
// Check if the FSM response, which is an interface, contains an error.
if err, ok := out.(error); ok && err != nil {
return err
}
// Populate the response. We do a lookup against the state to pick up the
// proper create / modify times.
stateSnapshot, err = a.srv.State().Snapshot()
@ -1972,16 +1957,11 @@ func (a *ACL) DeleteAuthMethods(
}
// Update via Raft
out, index, err := a.srv.raftApply(structs.ACLAuthMethodsDeleteRequestType, args)
_, index, err := a.srv.raftApply(structs.ACLAuthMethodsDeleteRequestType, args)
if err != nil {
return err
}
// Check if the FSM response, which is an interface, contains an error.
if err, ok := out.(error); ok && err != nil {
return err
}
// Update the index
reply.Index = index
return nil
@ -2278,16 +2258,11 @@ func (a *ACL) UpsertBindingRules(
}
// Update via Raft.
out, index, err := a.srv.raftApply(structs.ACLBindingRulesUpsertRequestType, args)
_, index, err := a.srv.raftApply(structs.ACLBindingRulesUpsertRequestType, args)
if err != nil {
return err
}
// Check if the FSM response, which is an interface, contains an error.
if err, ok := out.(error); ok && err != nil {
return err
}
// Populate the response. We do a lookup against the state to pick up the
// proper create / modify indexes.
stateSnapshot, err = a.srv.State().Snapshot()
@ -2353,16 +2328,11 @@ func (a *ACL) DeleteBindingRules(
}
// Update via Raft.
out, index, err := a.srv.raftApply(structs.ACLBindingRulesDeleteRequestType, args)
_, index, err := a.srv.raftApply(structs.ACLBindingRulesDeleteRequestType, args)
if err != nil {
return err
}
// Check if the FSM response, which is an interface, contains an error.
if err, ok := out.(error); ok && err != nil {
return err
}
// Update the index
reply.Index = index
return nil

View File

@ -354,14 +354,11 @@ func (v *CSIVolume) Register(args *structs.CSIVolumeRegisterRequest, reply *stru
}
}
resp, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, args)
_, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, args)
if err != nil {
v.logger.Error("csi raft apply failed", "error", err, "method", "register")
return err
}
if respErr, ok := resp.(error); ok {
return respErr
}
reply.Index = index
v.srv.setQueryMeta(&reply.QueryMeta)
@ -397,14 +394,11 @@ func (v *CSIVolume) Deregister(args *structs.CSIVolumeDeregisterRequest, reply *
return fmt.Errorf("missing volume IDs")
}
resp, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, args)
_, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, args)
if err != nil {
v.logger.Error("csi raft apply failed", "error", err, "method", "deregister")
return err
}
if respErr, ok := resp.(error); ok {
return respErr
}
reply.Index = index
v.srv.setQueryMeta(&reply.QueryMeta)
@ -458,14 +452,11 @@ func (v *CSIVolume) Claim(args *structs.CSIVolumeClaimRequest, reply *structs.CS
args.NodeID = alloc.NodeID
}
resp, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, args)
_, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, args)
if err != nil {
v.logger.Error("csi raft apply failed", "error", err, "method", "claim")
return err
}
if respErr, ok := resp.(error); ok {
return respErr
}
if isNewClaim {
// if this is a new claim, add a Volume and PublishContext from the
@ -931,14 +922,11 @@ func (v *CSIVolume) checkpointClaim(vol *structs.CSIVolume, claim *structs.CSIVo
Namespace: vol.Namespace,
},
}
resp, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, req)
_, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, req)
if err != nil {
v.logger.Error("csi raft apply failed", "error", err)
return err
}
if respErr, ok := resp.(error); ok {
return respErr
}
vol.ModifyIndex = index
return nil
}
@ -1023,13 +1011,10 @@ func (v *CSIVolume) Create(args *structs.CSIVolumeCreateRequest, reply *structs.
}
}
resp, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, regArgs)
_, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, regArgs)
if err != nil {
v.logger.Error("csi raft apply failed", "error", err, "method", "register")
return err
}
if respErr, ok := resp.(error); ok {
multierror.Append(&mErr, respErr)
multierror.Append(&mErr, err)
}
err = mErr.ErrorOrNil()
@ -1123,14 +1108,11 @@ func (v *CSIVolume) Delete(args *structs.CSIVolumeDeleteRequest, reply *structs.
VolumeIDs: args.VolumeIDs,
WriteRequest: args.WriteRequest,
}
resp, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, deregArgs)
_, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, deregArgs)
if err != nil {
v.logger.Error("csi raft apply failed", "error", err, "method", "deregister")
return err
}
if respErr, ok := resp.(error); ok {
return respErr
}
reply.Index = index
v.srv.setQueryMeta(&reply.QueryMeta)
@ -1611,16 +1593,12 @@ func (v *CSIPlugin) Delete(args *structs.CSIPluginDeleteRequest, reply *structs.
return fmt.Errorf("missing plugin ID")
}
resp, index, err := v.srv.raftApply(structs.CSIPluginDeleteRequestType, args)
_, index, err := v.srv.raftApply(structs.CSIPluginDeleteRequestType, args)
if err != nil {
v.logger.Error("csi raft apply failed", "error", err, "method", "delete")
return err
}
if respErr, ok := resp.(error); ok {
return respErr
}
reply.Index = index
v.srv.setQueryMeta(&reply.QueryMeta)
return nil

View File

@ -28,8 +28,8 @@ func (d drainerShim) NodesDrainComplete(nodes []string, event *structs.NodeEvent
}
}
resp, index, err := d.s.raftApply(structs.BatchNodeUpdateDrainRequestType, args)
return d.convertApplyErrors(resp, index, err)
_, index, err := d.s.raftApply(structs.BatchNodeUpdateDrainRequestType, args)
return index, err
}
func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.DesiredTransition, evals []*structs.Evaluation) (uint64, error) {
@ -38,19 +38,6 @@ func (d drainerShim) AllocUpdateDesiredTransition(allocs map[string]*structs.Des
Evals: evals,
WriteRequest: structs.WriteRequest{Region: d.s.config.Region},
}
resp, index, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args)
return d.convertApplyErrors(resp, index, err)
}
// convertApplyErrors parses the results of a raftApply and returns the index at
// which it was applied and any error that occurred. Raft Apply returns two
// separate errors, Raft library errors and user returned errors from the FSM.
// This helper, joins the errors by inspecting the applyResponse for an error.
func (d drainerShim) convertApplyErrors(applyResp interface{}, index uint64, err error) (uint64, error) {
if applyResp != nil {
if fsmErr, ok := applyResp.(error); ok && fsmErr != nil {
return index, fsmErr
}
}
_, index, err := d.s.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args)
return index, err
}

View File

@ -376,13 +376,9 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis
args.Deployment = j.multiregionCreateDeployment(job, eval)
// Commit this update via Raft
fsmErr, index, err := j.srv.raftApply(structs.JobRegisterRequestType, args)
if err, ok := fsmErr.(error); ok && err != nil {
j.logger.Error("registering job failed", "error", err, "fsm", true)
return err
}
_, index, err := j.srv.raftApply(structs.JobRegisterRequestType, args)
if err != nil {
j.logger.Error("registering job failed", "error", err, "raft", true)
j.logger.Error("registering job failed", "error", err)
return err
}
@ -850,6 +846,9 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD
if err != nil {
return err
}
if job == nil {
return nil
}
var eval *structs.Evaluation
@ -858,7 +857,7 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD
now := time.Now().UnixNano()
// If the job is periodic or parameterized, we don't create an eval.
if job == nil || !(job.IsPeriodic() || job.IsParameterized()) {
if !(job.IsPeriodic() || job.IsParameterized()) {
// The evaluation priority is determined by several factors. It
// defaults to the job default priority and is overridden by the
@ -867,7 +866,7 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD
// If the user supplied an eval priority override, we subsequently
// use this.
priority := structs.JobDefaultPriority
if job != nil {
if job.Priority > 0 {
priority = job.Priority
}
if args.EvalPriority > 0 {
@ -2027,13 +2026,9 @@ func (j *Job) Dispatch(args *structs.JobDispatchRequest, reply *structs.JobDispa
}
// Commit this update via Raft
fsmErr, jobCreateIndex, err := j.srv.raftApply(structs.JobRegisterRequestType, regReq)
if err, ok := fsmErr.(error); ok && err != nil {
j.logger.Error("dispatched job register failed", "error", err, "fsm", true)
return err
}
_, jobCreateIndex, err := j.srv.raftApply(structs.JobRegisterRequestType, regReq)
if err != nil {
j.logger.Error("dispatched job register failed", "error", err, "raft", true)
// Pass err as the value for the "error" key so the failure cause is logged.
j.logger.Error("dispatched job register failed", "error", err)
return err
}

View File

@ -3587,13 +3587,11 @@ func TestJobEndpoint_Deregister_ACL(t *testing.T) {
require.NotZero(eval.CreateTime)
require.NotZero(eval.ModifyTime)
// Deregistration is not idempotent, produces a new eval after the job is
// deregistered. TODO(langmartin) make it idempotent.
// Deregistration is idempotent
var validResp2 structs.JobDeregisterResponse
err = msgpackrpc.CallWithCodec(codec, "Job.Deregister", req, &validResp2)
require.NoError(err)
require.NotEqual("", validResp2.EvalID)
require.NotEqual(validResp.EvalID, validResp2.EvalID)
must.NoError(t, err)
must.Eq(t, "", validResp2.EvalID)
}
func TestJobEndpoint_Deregister_Nonexistent(t *testing.T) {
@ -3616,51 +3614,15 @@ func TestJobEndpoint_Deregister_Nonexistent(t *testing.T) {
},
}
var resp2 structs.JobDeregisterResponse
if err := msgpackrpc.CallWithCodec(codec, "Job.Deregister", dereg, &resp2); err != nil {
t.Fatalf("err: %v", err)
}
if resp2.JobModifyIndex == 0 {
t.Fatalf("bad index: %d", resp2.Index)
}
must.NoError(t, msgpackrpc.CallWithCodec(codec, "Job.Deregister", dereg, &resp2))
must.Eq(t, 0, resp2.JobModifyIndex, must.Sprint("expected no modify index"))
// Lookup the evaluation
state := s1.fsm.State()
ws := memdb.NewWatchSet()
eval, err := state.EvalByID(ws, resp2.EvalID)
if err != nil {
t.Fatalf("err: %v", err)
}
if eval == nil {
t.Fatalf("expected eval")
}
if eval.CreateIndex != resp2.EvalCreateIndex {
t.Fatalf("index mis-match")
}
if eval.Priority != structs.JobDefaultPriority {
t.Fatalf("bad: %#v", eval)
}
if eval.Type != structs.JobTypeService {
t.Fatalf("bad: %#v", eval)
}
if eval.TriggeredBy != structs.EvalTriggerJobDeregister {
t.Fatalf("bad: %#v", eval)
}
if eval.JobID != jobID {
t.Fatalf("bad: %#v", eval)
}
if eval.JobModifyIndex != resp2.JobModifyIndex {
t.Fatalf("bad: %#v", eval)
}
if eval.Status != structs.EvalStatusPending {
t.Fatalf("bad: %#v", eval)
}
if eval.CreateTime == 0 {
t.Fatalf("eval CreateTime is unset: %#v", eval)
}
if eval.ModifyTime == 0 {
t.Fatalf("eval ModifyTime is unset: %#v", eval)
}
eval, err := state.EvalsByJob(ws, structs.DefaultNamespace, jobID)
must.NoError(t, err)
must.Nil(t, eval)
}
func TestJobEndpoint_Deregister_EvalPriority(t *testing.T) {

View File

@ -69,13 +69,10 @@ func (k *Keyring) Rotate(args *structs.KeyringRotateRootKeyRequest, reply *struc
Rekey: args.Full,
WriteRequest: args.WriteRequest,
}
out, index, err := k.srv.raftApply(structs.RootKeyMetaUpsertRequestType, req)
_, index, err := k.srv.raftApply(structs.RootKeyMetaUpsertRequestType, req)
if err != nil {
return err
}
if err, ok := out.(error); ok && err != nil {
return err
}
reply.Key = rootKey.Meta
reply.Index = index
@ -197,13 +194,10 @@ func (k *Keyring) Update(args *structs.KeyringUpdateRootKeyRequest, reply *struc
}
// update the metadata via Raft
out, index, err := k.srv.raftApply(structs.RootKeyMetaUpsertRequestType, metaReq)
_, index, err := k.srv.raftApply(structs.RootKeyMetaUpsertRequestType, metaReq)
if err != nil {
return err
}
if err, ok := out.(error); ok && err != nil {
return err
}
reply.Index = index
return nil
@ -352,13 +346,10 @@ func (k *Keyring) Delete(args *structs.KeyringDeleteRootKeyRequest, reply *struc
}
// update via Raft
out, index, err := k.srv.raftApply(structs.RootKeyMetaDeleteRequestType, args)
_, index, err := k.srv.raftApply(structs.RootKeyMetaDeleteRequestType, args)
if err != nil {
return err
}
if err, ok := out.(error); ok && err != nil {
return err
}
// remove the key from the keyring too
k.encrypter.RemoveKey(args.KeyID)

View File

@ -60,16 +60,11 @@ func (n *Namespace) UpsertNamespaces(args *structs.NamespaceUpsertRequest,
}
// Update via Raft
out, index, err := n.srv.raftApply(structs.NamespaceUpsertRequestType, args)
_, index, err := n.srv.raftApply(structs.NamespaceUpsertRequestType, args)
if err != nil {
return err
}
// Check if there was an error when applying.
if err, ok := out.(error); ok && err != nil {
return err
}
// Update the index
reply.Index = index
return nil
@ -124,16 +119,11 @@ func (n *Namespace) DeleteNamespaces(args *structs.NamespaceDeleteRequest, reply
}
// Update via Raft
out, index, err := n.srv.raftApply(structs.NamespaceDeleteRequestType, args)
_, index, err := n.srv.raftApply(structs.NamespaceDeleteRequestType, args)
if err != nil {
return err
}
// Check if there was an error when applying.
if err, ok := out.(error); ok && err != nil {
return err
}
// Update the index
reply.Index = index
return nil

View File

@ -292,9 +292,6 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe
op.logger.Error("failed applying AutoPilot configuration", "error", err)
return err
}
if respErr, ok := resp.(error); ok {
return respErr
}
// Check if the return type is a bool.
if respBool, ok := resp.(bool); ok {
@ -371,9 +368,8 @@ func (op *Operator) SchedulerSetConfiguration(args *structs.SchedulerSetConfigRe
if err != nil {
op.logger.Error("failed applying Scheduler configuration", "error", err)
return err
} else if respErr, ok := resp.(error); ok {
return respErr
}
// If CAS request, raft returns a boolean indicating if the update was applied.
// Otherwise, assume success
reply.Updated = true

View File

@ -68,10 +68,7 @@ func (s *Server) DispatchJob(job *structs.Job) (*structs.Evaluation, error) {
Namespace: job.Namespace,
},
}
fsmErr, index, err := s.raftApply(structs.JobRegisterRequestType, req)
if err, ok := fsmErr.(error); ok && err != nil {
return nil, err
}
_, index, err := s.raftApply(structs.JobRegisterRequestType, req)
if err != nil {
return nil, err
}

View File

@ -765,9 +765,10 @@ func (s *Server) raftApplyFuture(t structs.MessageType, msg interface{}) (raft.A
// raftApplyFn is the function signature for applying a msg to Raft
type raftApplyFn func(t structs.MessageType, msg interface{}) (interface{}, uint64, error)
// raftApply is used to encode a message, run it through raft, and return
// the FSM response along with any errors
func (s *Server) raftApply(t structs.MessageType, msg interface{}) (interface{}, uint64, error) {
// raftApply is used to encode a message, run it through raft, and return the
// FSM response along with any errors. If the FSM.Apply response is an error it
// will be returned as the error return value with a nil response.
func (s *Server) raftApply(t structs.MessageType, msg any) (any, uint64, error) {
future, err := s.raftApplyFuture(t, msg)
if err != nil {
return nil, 0, err
@ -775,7 +776,11 @@ func (s *Server) raftApply(t structs.MessageType, msg interface{}) (interface{},
if err := future.Error(); err != nil {
return nil, 0, err
}
return future.Response(), future.Index(), nil
resp := future.Response()
if err, ok := resp.(error); ok && err != nil {
return nil, future.Index(), err
}
return resp, future.Index(), nil
}
// setQueryMeta is used to populate the QueryMeta data for an RPC call

View File

@ -86,16 +86,11 @@ func (s *ServiceRegistration) Upsert(
}
// Update via Raft.
out, index, err := s.srv.raftApply(structs.ServiceRegistrationUpsertRequestType, args)
_, index, err := s.srv.raftApply(structs.ServiceRegistrationUpsertRequestType, args)
if err != nil {
return err
}
// Check if the FSM response, which is an interface, contains an error.
if err, ok := out.(error); ok && err != nil {
return err
}
// Update the index. There is no need to floor this as we are writing to
// state and therefore will get a non-zero index response.
reply.Index = index
@ -164,16 +159,11 @@ func (s *ServiceRegistration) DeleteByID(
}
// Update via Raft.
out, index, err := s.srv.raftApply(structs.ServiceRegistrationDeleteByIDRequestType, args)
_, index, err := s.srv.raftApply(structs.ServiceRegistrationDeleteByIDRequestType, args)
if err != nil {
return err
}
// Check if the FSM response, which is an interface, contains an error.
if err, ok := out.(error); ok && err != nil {
return err
}
// Update the index. There is no need to floor this as we are writing to
// state and therefore will get a non-zero index response.
reply.Index = index