CSI: enforce usage at claim time (#12112)

* Remove redundant schedulable check in `FreeWriteClaims`. If a volume
  has been created but not yet claimed, its capabilities will be checked
  in `WriteSchedulable` at both scheduling time and claim time. We don't
  need to also check them in the `FreeWriteClaims` method.

* Enforce maximum volume claims for writers.

  When the scheduler checks feasibility for CSI volumes, the check is
  fairly loose: earlier versions of the same job are not counted as
  active claims. This allows the scheduler to place new allocations
  for the new version of a job, under the assumption that we'll replace
  the existing allocations and their volume claims.

  But when the alloc runner claims the volume, we need to enforce the
  active claims even if they're for allocations of an earlier version of
  the job. Otherwise we'll try to mount a volume that's currently being
  unmounted, and this will cause replacement allocations to frequently
  fail.

* Enforce single-node reader check for read-only volumes. When the
  alloc runner makes a claim for a read-only volume, we only check that
  the volume is potentially schedulable and not that it actually has
  free read claims.
This commit is contained in:
Tim Gross 2022-02-24 09:37:37 -05:00 committed by GitHub
parent 42b338308f
commit cfe3117af8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 39 additions and 26 deletions

3
.changelog/12112.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
csi: Fixed a bug where the maximum number of volume claims was incorrectly enforced when an allocation claims a volume
```

View File

@ -243,9 +243,11 @@ func TestCSIVolumeEndpoint_Claim(t *testing.T) {
// Create an initial volume claim request; we expect it to fail
// because there's no such volume yet.
claimReq := &structs.CSIVolumeClaimRequest{
VolumeID: id0,
AllocationID: alloc.ID,
Claim: structs.CSIVolumeClaimWrite,
VolumeID: id0,
AllocationID: alloc.ID,
Claim: structs.CSIVolumeClaimWrite,
AccessMode: structs.CSIVolumeAccessModeMultiNodeSingleWriter,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
WriteRequest: structs.WriteRequest{
Region: "global",
Namespace: structs.DefaultNamespace,

View File

@ -411,6 +411,26 @@ func (v *CSIVolume) WriteSchedulable() bool {
return false
}
// HasFreeReadClaims determines if there are any free read claims available
func (v *CSIVolume) HasFreeReadClaims() bool {
switch v.AccessMode {
case CSIVolumeAccessModeSingleNodeReader:
return len(v.ReadClaims) == 0
case CSIVolumeAccessModeSingleNodeWriter:
return len(v.ReadClaims) == 0 && len(v.WriteClaims) == 0
case CSIVolumeAccessModeUnknown:
// This volume was created but not yet claimed, so its
// capabilities have been checked in ReadSchedulable
return true
default:
// For multi-node AccessModes, the CSI spec doesn't allow for
// setting a max number of readers we track node resource
// exhaustion through v.ResourceExhausted which is checked in
// ReadSchedulable
return true
}
}
// HasFreeWriteClaims determines if there are any free write claims available
func (v *CSIVolume) HasFreeWriteClaims() bool {
switch v.AccessMode {
@ -422,24 +442,13 @@ func (v *CSIVolume) HasFreeWriteClaims() bool {
// which is checked in WriteSchedulable
return true
case CSIVolumeAccessModeUnknown:
// this volume was created but not yet claimed, so we check what it's
// capable of, not what it's been assigned
if len(v.RequestedCapabilities) == 0 {
// COMPAT: a volume that was registered before 1.1.0 and has not
// had a change in claims could have no requested caps. It will
// get corrected on the first claim.
return true
}
for _, cap := range v.RequestedCapabilities {
switch cap.AccessMode {
case CSIVolumeAccessModeSingleNodeWriter, CSIVolumeAccessModeMultiNodeSingleWriter:
return len(v.WriteClaims) == 0
case CSIVolumeAccessModeMultiNodeMultiWriter:
return true
}
}
// This volume was created but not yet claimed, so its
// capabilities have been checked in WriteSchedulable
return true
default:
// Reader modes never have free write claims
return false
}
return false
}
// InUse tests whether any allocations are actively using the volume
@ -538,6 +547,10 @@ func (v *CSIVolume) claimRead(claim *CSIVolumeClaim, alloc *Allocation) error {
return ErrCSIVolumeUnschedulable
}
if !v.HasFreeReadClaims() {
return ErrCSIVolumeMaxClaims
}
// Allocations are copy on write, so we want to keep the id but don't need the
// pointer. We'll get it from the db in denormalize.
v.ReadAllocs[claim.AllocationID] = nil
@ -565,12 +578,7 @@ func (v *CSIVolume) claimWrite(claim *CSIVolumeClaim, alloc *Allocation) error {
}
if !v.HasFreeWriteClaims() {
// Check the blocking allocations to see if they belong to this job
for _, a := range v.WriteAllocs {
if a != nil && (a.Namespace != alloc.Namespace || a.JobID != alloc.JobID) {
return ErrCSIVolumeMaxClaims
}
}
return ErrCSIVolumeMaxClaims
}
// Allocations are copy on write, so we want to keep the id but don't need the