// Copyright (c) HashiCorp, Inc. // SPDX-License-Identifier: MPL-2.0 package nomad import ( "fmt" "net/http" "strings" "time" "github.com/armon/go-metrics" "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/acl" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/state/paginator" "github.com/hashicorp/nomad/nomad/structs" ) // CSIVolume wraps the structs.CSIVolume with request data and server context type CSIVolume struct { srv *Server ctx *RPCContext logger hclog.Logger } func NewCSIVolumeEndpoint(srv *Server, ctx *RPCContext) *CSIVolume { return &CSIVolume{srv: srv, ctx: ctx, logger: srv.logger.Named("csi_volume")} } const ( csiVolumeTable = "csi_volumes" csiPluginTable = "csi_plugins" ) // replySetIndex sets the reply with the last index that modified the table func (s *Server) replySetIndex(table string, reply *structs.QueryMeta) error { fmsState := s.fsm.State() index, err := fmsState.Index(table) if err != nil { return err } reply.Index = index // Set the query response s.setQueryMeta(reply) return nil } // List replies with CSIVolumes, filtered by ACL access func (v *CSIVolume) List(args *structs.CSIVolumeListRequest, reply *structs.CSIVolumeListResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.List", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricList, args) if authErr != nil { return structs.ErrPermissionDenied } allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIListVolume, acl.NamespaceCapabilityCSIReadVolume, acl.NamespaceCapabilityCSIMountVolume, acl.NamespaceCapabilityListJobs) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } if !allowVolume(aclObj, args.RequestNamespace()) { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "list"}, time.Now()) ns := args.RequestNamespace() opts := blockingOptions{ queryOpts: &args.QueryOptions, queryMeta: &reply.QueryMeta, run: func(ws memdb.WatchSet, state *state.StateStore) error { snap, err := state.Snapshot() if err != nil { return err } // Query all volumes var iter memdb.ResultIterator prefix := args.Prefix if args.NodeID != "" { iter, err = snap.CSIVolumesByNodeID(ws, prefix, args.NodeID) } else if args.PluginID != "" { iter, err = snap.CSIVolumesByPluginID(ws, ns, prefix, args.PluginID) } else if prefix != "" { iter, err = snap.CSIVolumesByIDPrefix(ws, ns, prefix) } else if ns != structs.AllNamespacesSentinel { iter, err = snap.CSIVolumesByNamespace(ws, ns, prefix) } else { iter, err = snap.CSIVolumes(ws) } if err != nil { return err } tokenizer := paginator.NewStructsTokenizer( iter, paginator.StructsTokenizerOptions{ WithNamespace: true, WithID: true, }, ) volFilter := paginator.GenericFilter{ Allow: func(raw interface{}) (bool, error) { vol := raw.(*structs.CSIVolume) // Remove (possibly again) by PluginID to handle passing both // NodeID and PluginID if args.PluginID != "" && args.PluginID != vol.PluginID { return false, nil } // Remove by Namespace, since CSIVolumesByNodeID hasn't used // the Namespace yet if ns != structs.AllNamespacesSentinel && vol.Namespace != ns { return false, nil } return true, nil }, } filters := []paginator.Filter{volFilter} // Collect results, filter by ACL access vs := []*structs.CSIVolListStub{} paginator, err := paginator.NewPaginator(iter, tokenizer, filters, args.QueryOptions, func(raw interface{}) error { vol := raw.(*structs.CSIVolume) vol, err := snap.CSIVolumeDenormalizePlugins(ws, vol.Copy()) if err != nil { return err } vs = append(vs, vol.Stub()) return nil }) if err != nil { return structs.NewErrRPCCodedf( http.StatusBadRequest, "failed to create result paginator: %v", err) } nextToken, err := paginator.Page() if err != nil { return structs.NewErrRPCCodedf( http.StatusBadRequest, "failed to read result page: %v", err) } reply.QueryMeta.NextToken = nextToken reply.Volumes = vs return v.srv.replySetIndex(csiVolumeTable, &reply.QueryMeta) }} return v.srv.blockingRPC(&opts) } // Get fetches detailed information about a specific volume func (v *CSIVolume) Get(args *structs.CSIVolumeGetRequest, reply *structs.CSIVolumeGetResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.Get", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricRead, args) if authErr != nil { return structs.ErrPermissionDenied } allowCSIAccess := acl.NamespaceValidator(acl.NamespaceCapabilityCSIReadVolume, acl.NamespaceCapabilityCSIMountVolume, acl.NamespaceCapabilityReadJob) aclObj, err := v.srv.ResolveClientOrACL(args) if err != nil { return err } ns := args.RequestNamespace() if !allowCSIAccess(aclObj, ns) { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "get"}, time.Now()) if args.ID == "" { return fmt.Errorf("missing volume ID") } opts := blockingOptions{ queryOpts: &args.QueryOptions, queryMeta: &reply.QueryMeta, run: func(ws memdb.WatchSet, state *state.StateStore) error { snap, err := state.Snapshot() if err != nil { return err } vol, err := snap.CSIVolumeByID(ws, ns, args.ID) if err != nil { return err } if vol != nil { vol, err = snap.CSIVolumeDenormalize(ws, vol) } if err != nil { return err } reply.Volume = vol return v.srv.replySetIndex(csiVolumeTable, &reply.QueryMeta) }} return v.srv.blockingRPC(&opts) } func (v *CSIVolume) pluginValidateVolume(req *structs.CSIVolumeRegisterRequest, vol *structs.CSIVolume) (*structs.CSIPlugin, error) { state := v.srv.fsm.State() plugin, err := state.CSIPluginByID(nil, vol.PluginID) if err != nil { return nil, err } if plugin == nil { return nil, fmt.Errorf("no CSI plugin named: %s could be found", vol.PluginID) } vol.Provider = plugin.Provider vol.ProviderVersion = plugin.Version return plugin, nil } func (v *CSIVolume) controllerValidateVolume(req *structs.CSIVolumeRegisterRequest, vol *structs.CSIVolume, plugin *structs.CSIPlugin) error { if !plugin.ControllerRequired { // The plugin does not require a controller, so for now we won't do any // further validation of the volume. return nil } method := "ClientCSI.ControllerValidateVolume" cReq := &cstructs.ClientCSIControllerValidateVolumeRequest{ VolumeID: vol.RemoteID(), VolumeCapabilities: vol.RequestedCapabilities, Secrets: vol.Secrets, Parameters: vol.Parameters, Context: vol.Context, } cReq.PluginID = plugin.ID cResp := &cstructs.ClientCSIControllerValidateVolumeResponse{} return v.srv.RPC(method, cReq, cResp) } // Register registers a new volume or updates an existing volume. Note // that most user-defined CSIVolume fields are immutable once the // volume has been created. // // If the user needs to change fields because they've misconfigured // the registration of the external volume, we expect that claims // won't work either, and the user can deregister the volume and try // again with the right settings. This lets us be as strict with // validation here as the CreateVolume CSI RPC is expected to be. func (v *CSIVolume) Register(args *structs.CSIVolumeRegisterRequest, reply *structs.CSIVolumeRegisterResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.Register", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIWriteVolume) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } defer metrics.MeasureSince([]string{"nomad", "volume", "register"}, time.Now()) if !allowVolume(aclObj, args.RequestNamespace()) || !aclObj.AllowPluginRead() { return structs.ErrPermissionDenied } if len(args.Volumes) == 0 { return fmt.Errorf("missing volume definition") } // This is the only namespace we ACL checked, force all the volumes to use it. // We also validate that the plugin exists for each plugin, and validate the // capabilities when the plugin has a controller. for _, vol := range args.Volumes { snap, err := v.srv.State().Snapshot() if err != nil { return err } if vol.Namespace == "" { vol.Namespace = args.RequestNamespace() } if err = vol.Validate(); err != nil { return err } ws := memdb.NewWatchSet() existingVol, err := snap.CSIVolumeByID(ws, vol.Namespace, vol.ID) if err != nil { return err } // CSIVolume has many user-defined fields which are immutable // once set, and many fields that are controlled by Nomad and // are not user-settable. We merge onto a copy of the existing // volume to allow a user to submit a volume spec for `volume // create` and reuse it for updates in `volume register` // without having to manually remove the fields unused by // register (and similar use cases with API consumers such as // Terraform). if existingVol != nil { existingVol = existingVol.Copy() err = existingVol.Merge(vol) if err != nil { return err } *vol = *existingVol } else if vol.Topologies == nil || len(vol.Topologies) == 0 { // The topologies for the volume have already been set // when it was created, so for newly register volumes // we accept the user's description of that topology if vol.RequestedTopologies != nil { vol.Topologies = vol.RequestedTopologies.Required } } plugin, err := v.pluginValidateVolume(args, vol) if err != nil { return err } if err := v.controllerValidateVolume(args, vol, plugin); err != nil { return err } } _, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, args) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "register") return err } reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) return nil } // Deregister removes a set of volumes func (v *CSIVolume) Deregister(args *structs.CSIVolumeDeregisterRequest, reply *structs.CSIVolumeDeregisterResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.Deregister", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIWriteVolume) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } defer metrics.MeasureSince([]string{"nomad", "volume", "deregister"}, time.Now()) ns := args.RequestNamespace() if !allowVolume(aclObj, ns) { return structs.ErrPermissionDenied } if len(args.VolumeIDs) == 0 { return fmt.Errorf("missing volume IDs") } _, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, args) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "deregister") return err } reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) return nil } // Claim submits a change to a volume claim func (v *CSIVolume) Claim(args *structs.CSIVolumeClaimRequest, reply *structs.CSIVolumeClaimResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.Claim", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIMountVolume) aclObj, err := v.srv.ResolveClientOrACL(args) if err != nil { return err } defer metrics.MeasureSince([]string{"nomad", "volume", "claim"}, time.Now()) if !allowVolume(aclObj, args.RequestNamespace()) || !aclObj.AllowPluginRead() { return structs.ErrPermissionDenied } if args.VolumeID == "" { return fmt.Errorf("missing volume ID") } isNewClaim := args.Claim != structs.CSIVolumeClaimGC && args.State == structs.CSIVolumeClaimStateTaken // COMPAT(1.0): the NodeID field was added after 0.11.0 and so we // need to ensure it's been populated during upgrades from 0.11.0 // to later patch versions. Remove this block in 1.0 if isNewClaim && args.NodeID == "" { state := v.srv.fsm.State() ws := memdb.NewWatchSet() alloc, err := state.AllocByID(ws, args.AllocationID) if err != nil { return err } if alloc == nil { return fmt.Errorf("%s: %s", structs.ErrUnknownAllocationPrefix, args.AllocationID) } args.NodeID = alloc.NodeID } _, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, args) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "claim") return err } if isNewClaim { // if this is a new claim, add a Volume and PublishContext from the // controller (if any) to the reply err = v.controllerPublishVolume(args, reply) if err != nil { return fmt.Errorf("controller publish: %v", err) } } reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) return nil } func csiVolumeMountOptions(c *structs.CSIMountOptions) *cstructs.CSIVolumeMountOptions { if c == nil { return nil } return &cstructs.CSIVolumeMountOptions{ Filesystem: c.FSType, MountFlags: c.MountFlags, } } // controllerPublishVolume sends publish request to the CSI controller // plugin associated with a volume, if any. func (v *CSIVolume) controllerPublishVolume(req *structs.CSIVolumeClaimRequest, resp *structs.CSIVolumeClaimResponse) error { plug, vol, err := v.volAndPluginLookup(req.RequestNamespace(), req.VolumeID) if err != nil { return err } // Set the Response volume from the lookup resp.Volume = vol // Validate the existence of the allocation, regardless of whether we need it // now. state := v.srv.fsm.State() ws := memdb.NewWatchSet() alloc, err := state.AllocByID(ws, req.AllocationID) if err != nil { return err } if alloc == nil { return fmt.Errorf("%s: %s", structs.ErrUnknownAllocationPrefix, req.AllocationID) } // Some plugins support controllers for create/snapshot but not attach. So // if there's no plugin or the plugin doesn't attach volumes, then we can // skip the controller publish workflow and return nil. if plug == nil || !plug.HasControllerCapability(structs.CSIControllerSupportsAttachDetach) { return nil } // get Nomad's ID for the client node (not the storage provider's ID) targetNode, err := state.NodeByID(ws, alloc.NodeID) if err != nil { return err } if targetNode == nil { return fmt.Errorf("%s: %s", structs.ErrUnknownNodePrefix, alloc.NodeID) } // if the RPC is sent by a client node, it may not know the claim's // external node ID. if req.ExternalNodeID == "" { externalNodeID, err := v.lookupExternalNodeID(vol, req.ToClaim()) if err != nil { return fmt.Errorf("missing external node ID: %v", err) } req.ExternalNodeID = externalNodeID } method := "ClientCSI.ControllerAttachVolume" cReq := &cstructs.ClientCSIControllerAttachVolumeRequest{ VolumeID: vol.RemoteID(), ClientCSINodeID: req.ExternalNodeID, AttachmentMode: req.AttachmentMode, AccessMode: req.AccessMode, MountOptions: csiVolumeMountOptions(vol.MountOptions), ReadOnly: req.Claim == structs.CSIVolumeClaimRead, Secrets: vol.Secrets, VolumeContext: vol.Context, } cReq.PluginID = plug.ID cResp := &cstructs.ClientCSIControllerAttachVolumeResponse{} err = v.srv.RPC(method, cReq, cResp) if err != nil { if strings.Contains(err.Error(), "FailedPrecondition") { return fmt.Errorf("%v: %v", structs.ErrCSIClientRPCRetryable, err) } return err } resp.PublishContext = cResp.PublishContext return nil } func (v *CSIVolume) volAndPluginLookup(namespace, volID string) (*structs.CSIPlugin, *structs.CSIVolume, error) { state := v.srv.fsm.State() vol, err := state.CSIVolumeByID(nil, namespace, volID) if err != nil { return nil, nil, err } if vol == nil { return nil, nil, fmt.Errorf("volume not found: %s", volID) } if !vol.ControllerRequired { return nil, vol, nil } // note: we do this same lookup in CSIVolumeByID but then throw // away the pointer to the plugin rather than attaching it to // the volume so we have to do it again here. plug, err := state.CSIPluginByID(nil, vol.PluginID) if err != nil { return nil, nil, err } if plug == nil { return nil, nil, fmt.Errorf("plugin not found: %s", vol.PluginID) } return plug, vol, nil } // allowCSIMount is called on Job register to check mount permission func allowCSIMount(aclObj *acl.ACL, namespace string) bool { return aclObj.AllowPluginRead() && aclObj.AllowNsOp(namespace, acl.NamespaceCapabilityCSIMountVolume) } // Unpublish synchronously sends the NodeUnpublish, NodeUnstage, and // ControllerUnpublish RPCs to the client. It handles errors according to the // current claim state. func (v *CSIVolume) Unpublish(args *structs.CSIVolumeUnpublishRequest, reply *structs.CSIVolumeUnpublishResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.Unpublish", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "unpublish"}, time.Now()) allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIMountVolume) aclObj, err := v.srv.ResolveClientOrACL(args) if err != nil { return err } if !allowVolume(aclObj, args.RequestNamespace()) || !aclObj.AllowPluginRead() { return structs.ErrPermissionDenied } if args.VolumeID == "" { return fmt.Errorf("missing volume ID") } if args.Claim == nil { return fmt.Errorf("missing volume claim") } ws := memdb.NewWatchSet() state := v.srv.fsm.State() vol, err := state.CSIVolumeByID(ws, args.Namespace, args.VolumeID) if err != nil { return err } if vol == nil { return fmt.Errorf("no such volume") } claim := args.Claim // we need to checkpoint when we first get the claim to ensure we've set the // initial "past claim" state, otherwise a client that unpublishes (skipping // the node unpublish b/c it's done that work) fail to get written if the // controller unpublish fails. vol = vol.Copy() err = v.checkpointClaim(vol, claim) if err != nil { return err } // previous checkpoints may have set the past claim state already. // in practice we should never see CSIVolumeClaimStateControllerDetached // but having an option for the state makes it easy to add a checkpoint // in a backwards compatible way if we need one later switch claim.State { case structs.CSIVolumeClaimStateNodeDetached: goto NODE_DETACHED case structs.CSIVolumeClaimStateControllerDetached: goto RELEASE_CLAIM case structs.CSIVolumeClaimStateReadyToFree: goto RELEASE_CLAIM } vol = vol.Copy() err = v.nodeUnpublishVolume(vol, claim) if err != nil { return err } NODE_DETACHED: vol = vol.Copy() err = v.controllerUnpublishVolume(vol, claim) if err != nil { return err } RELEASE_CLAIM: v.logger.Trace("releasing claim", "vol", vol.ID) // advance a CSIVolumeClaimStateControllerDetached claim claim.State = structs.CSIVolumeClaimStateReadyToFree err = v.checkpointClaim(vol, claim) if err != nil { return err } reply.Index = vol.ModifyIndex v.srv.setQueryMeta(&reply.QueryMeta) return nil } // nodeUnpublishVolume handles the sending RPCs to the Node plugin to unmount // it. Typically this task is already completed on the client, but we need to // have this here so that GC can re-send it in case of client-side // problems. This function should only be called on a copy of the volume. func (v *CSIVolume) nodeUnpublishVolume(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error { v.logger.Trace("node unpublish", "vol", vol.ID) // We need a new snapshot after each checkpoint snap, err := v.srv.fsm.State().Snapshot() if err != nil { return err } // If the node has been GC'd or is down, we can't send it a node // unpublish. We need to assume the node has unpublished at its // end. If it hasn't, any controller unpublish will potentially // hang or error and need to be retried. if claim.NodeID != "" { node, err := snap.NodeByID(memdb.NewWatchSet(), claim.NodeID) if err != nil { return err } if node == nil || node.Status == structs.NodeStatusDown { v.logger.Debug("skipping node unpublish for down or GC'd node") claim.State = structs.CSIVolumeClaimStateNodeDetached return v.checkpointClaim(vol, claim) } } if claim.AllocationID != "" { err := v.nodeUnpublishVolumeImpl(vol, claim) if err != nil { return err } claim.State = structs.CSIVolumeClaimStateNodeDetached return v.checkpointClaim(vol, claim) } // The RPC sent from the 'nomad node detach' command or GC won't have an // allocation ID set so we try to unpublish every terminal or invalid // alloc on the node, all of which will be in PastClaims after denormalizing vol, err = snap.CSIVolumeDenormalize(memdb.NewWatchSet(), vol) if err != nil { return err } claimsToUnpublish := []*structs.CSIVolumeClaim{} for _, pastClaim := range vol.PastClaims { if claim.NodeID == pastClaim.NodeID { claimsToUnpublish = append(claimsToUnpublish, pastClaim) } } var merr multierror.Error for _, pastClaim := range claimsToUnpublish { err := v.nodeUnpublishVolumeImpl(vol, pastClaim) if err != nil { merr.Errors = append(merr.Errors, err) } } err = merr.ErrorOrNil() if err != nil { return err } claim.State = structs.CSIVolumeClaimStateNodeDetached return v.checkpointClaim(vol, claim) } func (v *CSIVolume) nodeUnpublishVolumeImpl(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error { if claim.AccessMode == structs.CSIVolumeAccessModeUnknown { // claim has already been released client-side return nil } req := &cstructs.ClientCSINodeDetachVolumeRequest{ PluginID: vol.PluginID, VolumeID: vol.ID, ExternalID: vol.RemoteID(), AllocID: claim.AllocationID, NodeID: claim.NodeID, AttachmentMode: claim.AttachmentMode, AccessMode: claim.AccessMode, ReadOnly: claim.Mode == structs.CSIVolumeClaimRead, } err := v.srv.RPC("ClientCSI.NodeDetachVolume", req, &cstructs.ClientCSINodeDetachVolumeResponse{}) if err != nil { // we should only get this error if the Nomad node disconnects and // is garbage-collected, so at this point we don't have any reason // to operate as though the volume is attached to it. // note: errors.Is cannot be used because the RPC call breaks // error wrapping. if !strings.Contains(err.Error(), structs.ErrUnknownNode.Error()) { return fmt.Errorf("could not detach from node: %w", err) } } return nil } // controllerUnpublishVolume handles the sending RPCs to the Controller plugin // to unpublish the volume (detach it from its host). This function should only // be called on a copy of the volume. func (v *CSIVolume) controllerUnpublishVolume(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error { v.logger.Trace("controller unpublish", "vol", vol.ID) if !vol.ControllerRequired { claim.State = structs.CSIVolumeClaimStateReadyToFree return nil } // We need a new snapshot after each checkpoint snap, err := v.srv.fsm.State().Snapshot() if err != nil { return err } ws := memdb.NewWatchSet() plugin, err := snap.CSIPluginByID(ws, vol.PluginID) if err != nil { return fmt.Errorf("could not query plugin: %v", err) } else if plugin == nil { return fmt.Errorf("no such plugin: %q", vol.PluginID) } if !plugin.HasControllerCapability(structs.CSIControllerSupportsAttachDetach) { claim.State = structs.CSIVolumeClaimStateReadyToFree return nil } vol, err = snap.CSIVolumeDenormalize(ws, vol) if err != nil { return err } // we only send a controller detach if a Nomad client no longer has any // claim to the volume, so we need to check the status of any other claimed // allocations shouldCancel := func(alloc *structs.Allocation) bool { if alloc != nil && alloc.ID != claim.AllocationID && alloc.NodeID == claim.NodeID && !alloc.TerminalStatus() { claim.State = structs.CSIVolumeClaimStateReadyToFree v.logger.Debug( "controller unpublish canceled: another non-terminal alloc is on this node", "vol", vol.ID, "alloc", alloc.ID) return true } return false } for _, alloc := range vol.ReadAllocs { if shouldCancel(alloc) { return nil } } for _, alloc := range vol.WriteAllocs { if shouldCancel(alloc) { return nil } } // if the RPC is sent by a client node, it may not know the claim's // external node ID. if claim.ExternalNodeID == "" { externalNodeID, err := v.lookupExternalNodeID(vol, claim) if err != nil { return fmt.Errorf("missing external node ID: %v", err) } claim.ExternalNodeID = externalNodeID } req := &cstructs.ClientCSIControllerDetachVolumeRequest{ VolumeID: vol.RemoteID(), ClientCSINodeID: claim.ExternalNodeID, Secrets: vol.Secrets, } req.PluginID = vol.PluginID err = v.srv.RPC("ClientCSI.ControllerDetachVolume", req, &cstructs.ClientCSIControllerDetachVolumeResponse{}) if err != nil { return fmt.Errorf("could not detach from controller: %v", err) } v.logger.Trace("controller detach complete", "vol", vol.ID) claim.State = structs.CSIVolumeClaimStateReadyToFree return v.checkpointClaim(vol, claim) } // lookupExternalNodeID gets the CSI plugin's ID for a node. we look it up in // the volume's claims first because it's possible the client has been stopped // and GC'd by this point, so looking there is the last resort. func (v *CSIVolume) lookupExternalNodeID(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) (string, error) { for _, rClaim := range vol.ReadClaims { if rClaim.NodeID == claim.NodeID && rClaim.ExternalNodeID != "" { return rClaim.ExternalNodeID, nil } } for _, wClaim := range vol.WriteClaims { if wClaim.NodeID == claim.NodeID && wClaim.ExternalNodeID != "" { return wClaim.ExternalNodeID, nil } } for _, pClaim := range vol.PastClaims { if pClaim.NodeID == claim.NodeID && pClaim.ExternalNodeID != "" { return pClaim.ExternalNodeID, nil } } // fallback to looking up the node plugin ws := memdb.NewWatchSet() state := v.srv.fsm.State() targetNode, err := state.NodeByID(ws, claim.NodeID) if err != nil { return "", err } if targetNode == nil { return "", fmt.Errorf("%s: %s", structs.ErrUnknownNodePrefix, claim.NodeID) } // get the the storage provider's ID for the client node (not // Nomad's ID for the node) targetCSIInfo, ok := targetNode.CSINodePlugins[vol.PluginID] if !ok || targetCSIInfo.NodeInfo == nil { return "", fmt.Errorf("failed to find storage provider info for client %q, node plugin %q is not running or has not fingerprinted on this client", targetNode.ID, vol.PluginID) } return targetCSIInfo.NodeInfo.ID, nil } func (v *CSIVolume) checkpointClaim(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error { v.logger.Trace("checkpointing claim") req := structs.CSIVolumeClaimRequest{ VolumeID: vol.ID, AllocationID: claim.AllocationID, NodeID: claim.NodeID, Claim: claim.Mode, State: claim.State, WriteRequest: structs.WriteRequest{ Namespace: vol.Namespace, }, } _, index, err := v.srv.raftApply(structs.CSIVolumeClaimRequestType, req) if err != nil { v.logger.Error("csi raft apply failed", "error", err) return err } vol.ModifyIndex = index return nil } func (v *CSIVolume) Create(args *structs.CSIVolumeCreateRequest, reply *structs.CSIVolumeCreateResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.Create", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "create"}, time.Now()) allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIWriteVolume) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } if !allowVolume(aclObj, args.RequestNamespace()) || !aclObj.AllowPluginRead() { return structs.ErrPermissionDenied } if len(args.Volumes) == 0 { return fmt.Errorf("missing volume definition") } regArgs := &structs.CSIVolumeRegisterRequest{WriteRequest: args.WriteRequest} type validated struct { vol *structs.CSIVolume plugin *structs.CSIPlugin } validatedVols := []validated{} // This is the only namespace we ACL checked, force all the volumes to use it. // We also validate that the plugin exists for each plugin, and validate the // capabilities when the plugin has a controller. for _, vol := range args.Volumes { if vol.Namespace == "" { vol.Namespace = args.RequestNamespace() } if err = vol.Validate(); err != nil { return err } plugin, err := v.pluginValidateVolume(regArgs, vol) if err != nil { return err } if !plugin.ControllerRequired { return fmt.Errorf("plugin has no controller") } if !plugin.HasControllerCapability(structs.CSIControllerSupportsCreateDelete) { return fmt.Errorf("plugin does not support creating volumes") } validatedVols = append(validatedVols, validated{vol, plugin}) } // Attempt to create all the validated volumes and write only successfully // created volumes to raft. And we'll report errors for any failed volumes // // NOTE: creating the volume in the external storage provider can't be // made atomic with the registration, and creating the volume provides // values we want to write on the CSIVolume in raft anyways. For now // we'll block the RPC on the external storage provider so that we can // easily return meaningful errors to the user, but in the future we // should consider creating registering first and creating a "volume // eval" that can do the plugin RPCs async. var mErr multierror.Error for _, valid := range validatedVols { err = v.createVolume(valid.vol, valid.plugin) if err != nil { multierror.Append(&mErr, err) } else { regArgs.Volumes = append(regArgs.Volumes, valid.vol) } } _, index, err := v.srv.raftApply(structs.CSIVolumeRegisterRequestType, regArgs) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "register") multierror.Append(&mErr, err) } err = mErr.ErrorOrNil() if err != nil { return err } reply.Volumes = regArgs.Volumes reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) return nil } func (v *CSIVolume) createVolume(vol *structs.CSIVolume, plugin *structs.CSIPlugin) error { method := "ClientCSI.ControllerCreateVolume" cReq := &cstructs.ClientCSIControllerCreateVolumeRequest{ Name: vol.Name, VolumeCapabilities: vol.RequestedCapabilities, MountOptions: vol.MountOptions, Parameters: vol.Parameters, Secrets: vol.Secrets, CapacityMin: vol.RequestedCapacityMin, CapacityMax: vol.RequestedCapacityMax, SnapshotID: vol.SnapshotID, CloneID: vol.CloneID, RequestedTopologies: vol.RequestedTopologies, } cReq.PluginID = plugin.ID cResp := &cstructs.ClientCSIControllerCreateVolumeResponse{} err := v.srv.RPC(method, cReq, cResp) if err != nil { return err } vol.ExternalID = cResp.ExternalVolumeID vol.Capacity = cResp.CapacityBytes vol.Context = cResp.VolumeContext vol.Topologies = cResp.Topologies return nil } func (v *CSIVolume) Delete(args *structs.CSIVolumeDeleteRequest, reply *structs.CSIVolumeDeleteResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.Delete", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "delete"}, time.Now()) allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIWriteVolume) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } if !allowVolume(aclObj, args.RequestNamespace()) || !aclObj.AllowPluginRead() { return structs.ErrPermissionDenied } if len(args.VolumeIDs) == 0 { return fmt.Errorf("missing volume IDs") } for _, volID := range args.VolumeIDs { plugin, vol, err := v.volAndPluginLookup(args.Namespace, volID) if err != nil { if err == fmt.Errorf("volume not found: %s", volID) { v.logger.Warn("volume %q to be deleted was already deregistered") continue } else { return err } } // NOTE: deleting the volume in the external storage provider can't be // made atomic with deregistration. We can't delete a volume that's // not registered because we need to be able to lookup its plugin. err = v.deleteVolume(vol, plugin, args.Secrets) if err != nil { return err } } deregArgs := &structs.CSIVolumeDeregisterRequest{ VolumeIDs: args.VolumeIDs, WriteRequest: args.WriteRequest, } _, index, err := v.srv.raftApply(structs.CSIVolumeDeregisterRequestType, deregArgs) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "deregister") return err } reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) return nil } func (v *CSIVolume) deleteVolume(vol *structs.CSIVolume, plugin *structs.CSIPlugin, querySecrets structs.CSISecrets) error { // Combine volume and query secrets into one map. // Query secrets override any secrets stored with the volume. combinedSecrets := vol.Secrets for k, v := range querySecrets { combinedSecrets[k] = v } method := "ClientCSI.ControllerDeleteVolume" cReq := &cstructs.ClientCSIControllerDeleteVolumeRequest{ ExternalVolumeID: vol.ExternalID, Secrets: combinedSecrets, } cReq.PluginID = plugin.ID cResp := &cstructs.ClientCSIControllerDeleteVolumeResponse{} return v.srv.RPC(method, cReq, cResp) } func (v *CSIVolume) ListExternal(args *structs.CSIVolumeExternalListRequest, reply *structs.CSIVolumeExternalListResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.ListExternal", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricList, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "list_external"}, time.Now()) allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIListVolume, acl.NamespaceCapabilityCSIReadVolume, acl.NamespaceCapabilityCSIMountVolume, acl.NamespaceCapabilityListJobs) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } // NOTE: this is the plugin's namespace, not the volume(s) because they // might not even be registered if !allowVolume(aclObj, args.RequestNamespace()) { return structs.ErrPermissionDenied } snap, err := v.srv.fsm.State().Snapshot() if err != nil { return err } plugin, err := snap.CSIPluginByID(nil, args.PluginID) if err != nil { return err } if plugin == nil { return fmt.Errorf("no such plugin") } if !plugin.HasControllerCapability(structs.CSIControllerSupportsListVolumes) { return fmt.Errorf("unimplemented for this plugin") } method := "ClientCSI.ControllerListVolumes" cReq := &cstructs.ClientCSIControllerListVolumesRequest{ MaxEntries: args.PerPage, StartingToken: args.NextToken, } cReq.PluginID = plugin.ID cResp := &cstructs.ClientCSIControllerListVolumesResponse{} err = v.srv.RPC(method, cReq, cResp) if err != nil { return err } if args.PerPage > 0 && args.PerPage < int32(len(cResp.Entries)) { // this should be done in the plugin already, but enforce it reply.Volumes = cResp.Entries[:args.PerPage] } else { reply.Volumes = cResp.Entries } reply.NextToken = cResp.NextToken return nil } func (v *CSIVolume) CreateSnapshot(args *structs.CSISnapshotCreateRequest, reply *structs.CSISnapshotCreateResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.CreateSnapshot", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "create_snapshot"}, time.Now()) allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIWriteVolume) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } if !allowVolume(aclObj, args.RequestNamespace()) || !aclObj.AllowPluginRead() { return structs.ErrPermissionDenied } state, err := v.srv.fsm.State().Snapshot() if err != nil { return err } method := "ClientCSI.ControllerCreateSnapshot" var mErr multierror.Error for _, snap := range args.Snapshots { if snap == nil { // we intentionally don't multierror here because we're in a weird state return fmt.Errorf("snapshot cannot be nil") } vol, err := state.CSIVolumeByID(nil, args.RequestNamespace(), snap.SourceVolumeID) if err != nil { multierror.Append(&mErr, fmt.Errorf("error querying volume %q: %v", snap.SourceVolumeID, err)) continue } if vol == nil { multierror.Append(&mErr, fmt.Errorf("no such volume %q", snap.SourceVolumeID)) continue } pluginID := snap.PluginID if pluginID == "" { pluginID = vol.PluginID } plugin, err := state.CSIPluginByID(nil, pluginID) if err != nil { multierror.Append(&mErr, fmt.Errorf("error querying plugin %q: %v", pluginID, err)) continue } if plugin == nil { multierror.Append(&mErr, fmt.Errorf("no such plugin %q", pluginID)) continue } if !plugin.HasControllerCapability(structs.CSIControllerSupportsCreateDeleteSnapshot) { multierror.Append(&mErr, fmt.Errorf("plugin %q does not support snapshot", pluginID)) continue } secrets := vol.Secrets for k, v := range snap.Secrets { // merge request secrets onto volume secrets secrets[k] = v } cReq := &cstructs.ClientCSIControllerCreateSnapshotRequest{ ExternalSourceVolumeID: vol.ExternalID, Name: snap.Name, Secrets: secrets, Parameters: snap.Parameters, } cReq.PluginID = pluginID cResp := &cstructs.ClientCSIControllerCreateSnapshotResponse{} err = v.srv.RPC(method, cReq, cResp) if err != nil { multierror.Append(&mErr, fmt.Errorf("could not create snapshot: %v", err)) continue } reply.Snapshots = append(reply.Snapshots, &structs.CSISnapshot{ ID: cResp.ID, ExternalSourceVolumeID: cResp.ExternalSourceVolumeID, SizeBytes: cResp.SizeBytes, CreateTime: cResp.CreateTime, IsReady: cResp.IsReady, }) } return mErr.ErrorOrNil() } func (v *CSIVolume) DeleteSnapshot(args *structs.CSISnapshotDeleteRequest, reply *structs.CSISnapshotDeleteResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.DeleteSnapshot", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "delete_snapshot"}, time.Now()) allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIWriteVolume) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } // NOTE: this is the plugin's namespace, not the snapshot(s) because we // don't track snapshots in the state store at all and their source // volume(s) because they might not even be registered if !allowVolume(aclObj, args.RequestNamespace()) || !aclObj.AllowPluginRead() { return structs.ErrPermissionDenied } stateSnap, err := v.srv.fsm.State().Snapshot() if err != nil { return err } var mErr multierror.Error for _, snap := range args.Snapshots { if snap == nil { // we intentionally don't multierror here because we're in a weird state return fmt.Errorf("snapshot cannot be nil") } plugin, err := stateSnap.CSIPluginByID(nil, snap.PluginID) if err != nil { multierror.Append(&mErr, fmt.Errorf("could not query plugin %q: %v", snap.PluginID, err)) continue } if plugin == nil { multierror.Append(&mErr, fmt.Errorf("no such plugin")) continue } if !plugin.HasControllerCapability(structs.CSIControllerSupportsCreateDeleteSnapshot) { multierror.Append(&mErr, fmt.Errorf("plugin does not support snapshot")) continue } method := "ClientCSI.ControllerDeleteSnapshot" cReq := &cstructs.ClientCSIControllerDeleteSnapshotRequest{ID: snap.ID} cReq.PluginID = plugin.ID cResp := &cstructs.ClientCSIControllerDeleteSnapshotResponse{} err = v.srv.RPC(method, cReq, cResp) if err != nil { multierror.Append(&mErr, fmt.Errorf("could not delete %q: %v", snap.ID, err)) } } return mErr.ErrorOrNil() } func (v *CSIVolume) ListSnapshots(args *structs.CSISnapshotListRequest, reply *structs.CSISnapshotListResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIVolume.ListSnapshots", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_volume", structs.RateMetricList, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "volume", "list_snapshots"}, time.Now()) allowVolume := acl.NamespaceValidator(acl.NamespaceCapabilityCSIListVolume, acl.NamespaceCapabilityCSIReadVolume, acl.NamespaceCapabilityCSIMountVolume, acl.NamespaceCapabilityListJobs) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } // NOTE: this is the plugin's namespace, not the volume(s) because they // might not even be registered if !allowVolume(aclObj, args.RequestNamespace()) { return structs.ErrPermissionDenied } snap, err := v.srv.fsm.State().Snapshot() if err != nil { return err } plugin, err := snap.CSIPluginByID(nil, args.PluginID) if err != nil { return err } if plugin == nil { return fmt.Errorf("no such plugin") } if !plugin.HasControllerCapability(structs.CSIControllerSupportsListSnapshots) { return fmt.Errorf("plugin does not support listing snapshots") } method := "ClientCSI.ControllerListSnapshots" cReq := &cstructs.ClientCSIControllerListSnapshotsRequest{ MaxEntries: args.PerPage, StartingToken: args.NextToken, Secrets: args.Secrets, } cReq.PluginID = plugin.ID cResp := &cstructs.ClientCSIControllerListSnapshotsResponse{} err = v.srv.RPC(method, cReq, cResp) if err != nil { return err } if args.PerPage > 0 && args.PerPage < int32(len(cResp.Entries)) { // this should be done in the plugin already, but enforce it reply.Snapshots = cResp.Entries[:args.PerPage] } else { reply.Snapshots = cResp.Entries } reply.NextToken = cResp.NextToken return nil } // CSIPlugin wraps the structs.CSIPlugin with request data and server context type CSIPlugin struct { srv *Server ctx *RPCContext logger hclog.Logger } func NewCSIPluginEndpoint(srv *Server, ctx *RPCContext) *CSIPlugin { return &CSIPlugin{srv: srv, ctx: ctx, logger: srv.logger.Named("csi_plugin")} } // List replies with CSIPlugins, filtered by ACL access func (v *CSIPlugin) List(args *structs.CSIPluginListRequest, reply *structs.CSIPluginListResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIPlugin.List", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_plugin", structs.RateMetricList, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "plugin", "list"}, time.Now()) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } if !aclObj.AllowPluginList() { return structs.ErrPermissionDenied } opts := blockingOptions{ queryOpts: &args.QueryOptions, queryMeta: &reply.QueryMeta, run: func(ws memdb.WatchSet, state *state.StateStore) error { var iter memdb.ResultIterator var err error if args.Prefix != "" { iter, err = state.CSIPluginsByIDPrefix(ws, args.Prefix) if err != nil { return err } } else { // Query all plugins iter, err = state.CSIPlugins(ws) if err != nil { return err } } // Collect results ps := []*structs.CSIPluginListStub{} for { raw := iter.Next() if raw == nil { break } plug := raw.(*structs.CSIPlugin) ps = append(ps, plug.Stub()) } reply.Plugins = ps return v.srv.replySetIndex(csiPluginTable, &reply.QueryMeta) }} return v.srv.blockingRPC(&opts) } // Get fetches detailed information about a specific plugin func (v *CSIPlugin) Get(args *structs.CSIPluginGetRequest, reply *structs.CSIPluginGetResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIPlugin.Get", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_plugin", structs.RateMetricRead, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "plugin", "get"}, time.Now()) aclObj, err := v.srv.ResolveACL(args) if err != nil { return err } if !aclObj.AllowPluginRead() { return structs.ErrPermissionDenied } withAllocs := aclObj == nil || aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilityReadJob) if args.ID == "" { return fmt.Errorf("missing plugin ID") } opts := blockingOptions{ queryOpts: &args.QueryOptions, queryMeta: &reply.QueryMeta, run: func(ws memdb.WatchSet, state *state.StateStore) error { snap, err := state.Snapshot() if err != nil { return err } plug, err := snap.CSIPluginByID(ws, args.ID) if err != nil { return err } if plug == nil { return nil } if withAllocs { plug, err = snap.CSIPluginDenormalize(ws, plug.Copy()) if err != nil { return err } // Filter the allocation stubs by our namespace. withAllocs // means we're allowed var as []*structs.AllocListStub for _, a := range plug.Allocations { if a.Namespace == args.RequestNamespace() { as = append(as, a) } } plug.Allocations = as } reply.Plugin = plug return v.srv.replySetIndex(csiPluginTable, &reply.QueryMeta) }} return v.srv.blockingRPC(&opts) } // Delete deletes a plugin if it is unused func (v *CSIPlugin) Delete(args *structs.CSIPluginDeleteRequest, reply *structs.CSIPluginDeleteResponse) error { authErr := v.srv.Authenticate(v.ctx, args) if done, err := v.srv.forward("CSIPlugin.Delete", args, args, reply); done { return err } v.srv.MeasureRPCRate("csi_plugin", structs.RateMetricWrite, args) if authErr != nil { return structs.ErrPermissionDenied } defer metrics.MeasureSince([]string{"nomad", "plugin", "delete"}, time.Now()) // Check that it is a management token. if aclObj, err := v.srv.ResolveACL(args); err != nil { return err } else if aclObj != nil && !aclObj.IsManagement() { return structs.ErrPermissionDenied } if args.ID == "" { return fmt.Errorf("missing plugin ID") } _, index, err := v.srv.raftApply(structs.CSIPluginDeleteRequestType, args) if err != nil { v.logger.Error("csi raft apply failed", "error", err, "method", "delete") return err } reply.Index = index v.srv.setQueryMeta(&reply.QueryMeta) return nil }