CSI: skip node unpublish on GC'd or down nodes (#13301)
If the node has been GC'd or is down, we can't send it a node unpublish. The CSI spec requires that we don't send the controller unpublish before the node unpublish, but when a node is gone we can't know the final fate of the node unpublish step. The `csi_hook` on the client will unpublish if the allocation has stopped, and if the host is terminated there's no mount for the volume anyway. So we'll now assume that the node has unpublished at its end. If it hasn't, any controller unpublish will potentially hang or error and need to be retried.
This commit is contained in:
parent
94a78597d2
commit
9d5523a72d
|
@ -0,0 +1,3 @@
|
|||
```release-note:bug
|
||||
csi: Fixed a bug where volume claims on lost or garbage-collected nodes could not be freed
|
||||
```
|
|
@ -686,6 +686,25 @@ RELEASE_CLAIM:
|
|||
|
||||
func (v *CSIVolume) nodeUnpublishVolume(vol *structs.CSIVolume, claim *structs.CSIVolumeClaim) error {
|
||||
v.logger.Trace("node unpublish", "vol", vol.ID)
|
||||
|
||||
store := v.srv.fsm.State()
|
||||
|
||||
// If the node has been GC'd or is down, we can't send it a node
|
||||
// unpublish. We need to assume the node has unpublished at its
|
||||
// end. If it hasn't, any controller unpublish will potentially
|
||||
// hang or error and need to be retried.
|
||||
if claim.NodeID != "" {
|
||||
node, err := store.NodeByID(memdb.NewWatchSet(), claim.NodeID)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if node == nil || node.Status == structs.NodeStatusDown {
|
||||
v.logger.Debug("skipping node unpublish for down or GC'd node")
|
||||
claim.State = structs.CSIVolumeClaimStateNodeDetached
|
||||
return v.checkpointClaim(vol, claim)
|
||||
}
|
||||
}
|
||||
|
||||
if claim.AllocationID != "" {
|
||||
err := v.nodeUnpublishVolumeImpl(vol, claim)
|
||||
if err != nil {
|
||||
|
@ -698,8 +717,7 @@ func (v *CSIVolume) nodeUnpublishVolume(vol *structs.CSIVolume, claim *structs.C
|
|||
// The RPC sent from the 'nomad node detach' command or GC won't have an
|
||||
// allocation ID set so we try to unpublish every terminal or invalid
|
||||
// alloc on the node, all of which will be in PastClaims after denormalizing
|
||||
state := v.srv.fsm.State()
|
||||
vol, err := state.CSIVolumeDenormalize(memdb.NewWatchSet(), vol)
|
||||
vol, err := store.CSIVolumeDenormalize(memdb.NewWatchSet(), vol)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -504,22 +504,32 @@ func TestCSIVolumeEndpoint_Unpublish(t *testing.T) {
|
|||
type tc struct {
|
||||
name string
|
||||
startingState structs.CSIVolumeClaimState
|
||||
nodeID string
|
||||
expectedErrMsg string
|
||||
}
|
||||
testCases := []tc{
|
||||
{
|
||||
name: "success",
|
||||
startingState: structs.CSIVolumeClaimStateControllerDetached,
|
||||
nodeID: node.ID,
|
||||
},
|
||||
{
|
||||
name: "unpublish previously detached node",
|
||||
startingState: structs.CSIVolumeClaimStateNodeDetached,
|
||||
expectedErrMsg: "could not detach from controller: controller detach volume: No path to node",
|
||||
nodeID: node.ID,
|
||||
},
|
||||
{
|
||||
name: "unpublish claim on garbage collected node",
|
||||
startingState: structs.CSIVolumeClaimStateTaken,
|
||||
expectedErrMsg: "could not detach from controller: controller detach volume: No path to node",
|
||||
nodeID: uuid.Generate(),
|
||||
},
|
||||
{
|
||||
name: "first unpublish",
|
||||
startingState: structs.CSIVolumeClaimStateTaken,
|
||||
expectedErrMsg: "could not detach from controller: controller detach volume: No path to node",
|
||||
nodeID: node.ID,
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -545,7 +555,7 @@ func TestCSIVolumeEndpoint_Unpublish(t *testing.T) {
|
|||
|
||||
// setup: create an alloc that will claim our volume
|
||||
alloc := mock.BatchAlloc()
|
||||
alloc.NodeID = node.ID
|
||||
alloc.NodeID = tc.nodeID
|
||||
alloc.ClientStatus = structs.AllocClientStatusFailed
|
||||
|
||||
index++
|
||||
|
@ -554,7 +564,7 @@ func TestCSIVolumeEndpoint_Unpublish(t *testing.T) {
|
|||
// setup: claim the volume for our alloc
|
||||
claim := &structs.CSIVolumeClaim{
|
||||
AllocationID: alloc.ID,
|
||||
NodeID: node.ID,
|
||||
NodeID: tc.nodeID,
|
||||
ExternalNodeID: "i-example",
|
||||
Mode: structs.CSIVolumeClaimRead,
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue