ad7355e58b
When claiming a CSI volume, we need to ensure the CSI node plugin is running before we send any CSI RPCs. This extends even to the controller publish RPC, because it requires the storage provider's "external node ID" for the client. This primarily impacts client restarts, but it is also a problem if the node plugin exits (and fingerprints) while the allocation that needs a CSI volume claim is being placed.

Unfortunately there's no mapping of volume to plugin ID available in the jobspec, so we don't have enough information to wait on plugins until we either get the volume from the server or retrieve the plugin ID from data we've persisted on the client.

If we always require getting the volume from the server before making the claim, a client restart for disconnected clients will cause all the allocations that need CSI volumes to fail. Even while connected, checking in with the server to verify the volume's plugin before trying to make a claim RPC is inherently racy, so we leave that case as-is: the claim will fail if the node plugin needed to support a newly placed allocation is flapping such that the node fingerprint is changing.

This changeset persists a minimal subset of data about the volume and its plugin in the client state DB, and retrieves that data during the CSI hook's prerun to avoid re-claiming and remounting the volume unnecessarily. It also updates the RPC handler to use the external node ID from the claim whenever it is available.

Fixes: #13028
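For reference, the persisted per-volume record has roughly the shape exercised by the test fixtures below. This is a sketch only, assuming the csimanager.MountInfo type; the authoritative definition lives in client/allocrunner/state and may carry additional fields.

// CSIVolumeStub (sketch): the minimal data the CSI hook stores in the client
// state DB so it can wait on the right node plugin and skip re-claiming and
// remounting after a client restart.
type CSIVolumeStub struct {
	VolumeID       string                // ID of the claimed volume
	PluginID       string                // node plugin to wait for before sending CSI RPCs
	ExternalNodeID string                // storage provider's ID for the client node
	MountInfo      *csimanager.MountInfo // where the volume was previously mounted
}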
548 lines
16 KiB
Go
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package allocrunner

import (
	"context"
	"errors"
	"fmt"
	"path/filepath"
	"sync"
	"testing"
	"time"

	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
	"github.com/hashicorp/nomad/client/allocrunner/state"
	"github.com/hashicorp/nomad/client/pluginmanager"
	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
	"github.com/shoenig/test/must"
	"github.com/stretchr/testify/require"
	"golang.org/x/exp/maps"
)

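// compile-time assertions that the CSI hook implements the prerun and postrun
// alloc runner hook interfaces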
var _ interfaces.RunnerPrerunHook = (*csiHook)(nil)
var _ interfaces.RunnerPostrunHook = (*csiHook)(nil)

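// TestCSIHook exercises the CSI hook's Prerun/Postrun lifecycle against mocked
// RPCs, plugin manager, and volume mounter, covering claim retries,
// previously-persisted mounts, and unmount retries.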
func TestCSIHook(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	testMountSrc := fmt.Sprintf(
		"test-alloc-dir/%s/testvolume0/ro-file-system-single-node-reader-only", alloc.ID)
	logger := testlog.HCLogger(t)

	testcases := []struct {
		name                  string
		volumeRequests        map[string]*structs.VolumeRequest
		startsUnschedulable   bool
		startsWithClaims      bool
		startsWithStubs       map[string]*state.CSIVolumeStub
		startsWithValidMounts bool
		failsFirstUnmount     bool
		expectedClaimErr      error
		expectedMounts        map[string]*csimanager.MountInfo
		expectedCalls         map[string]int
	}{

		{
			name: "simple case",
			volumeRequests: map[string]*structs.VolumeRequest{
				"vol0": {
					Name:           "vol0",
					Type:           structs.VolumeTypeCSI,
					Source:         "testvolume0",
					ReadOnly:       true,
					AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
					AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
					MountOptions:   &structs.CSIMountOptions{},
					PerAlloc:       false,
				},
			},
			expectedMounts: map[string]*csimanager.MountInfo{
				"vol0": &csimanager.MountInfo{Source: testMountSrc},
			},
			expectedCalls: map[string]int{
				"claim": 1, "mount": 1, "unmount": 1, "unpublish": 1},
		},

		{
			name: "per-alloc case",
			volumeRequests: map[string]*structs.VolumeRequest{
				"vol0": {
					Name:           "vol0",
					Type:           structs.VolumeTypeCSI,
					Source:         "testvolume0",
					ReadOnly:       true,
					AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
					AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
					MountOptions:   &structs.CSIMountOptions{},
					PerAlloc:       true,
				},
			},
			expectedMounts: map[string]*csimanager.MountInfo{
				"vol0": &csimanager.MountInfo{Source: testMountSrc},
			},
			expectedCalls: map[string]int{
				"claim": 1, "mount": 1, "unmount": 1, "unpublish": 1},
		},

		{
			name: "fatal error on claim",
			volumeRequests: map[string]*structs.VolumeRequest{
				"vol0": {
					Name:           "vol0",
					Type:           structs.VolumeTypeCSI,
					Source:         "testvolume0",
					ReadOnly:       true,
					AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
					AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
					MountOptions:   &structs.CSIMountOptions{},
					PerAlloc:       false,
				},
			},
			startsUnschedulable: true,
			expectedMounts: map[string]*csimanager.MountInfo{
				"vol0": &csimanager.MountInfo{Source: testMountSrc},
			},
			expectedCalls: map[string]int{"claim": 1},
			expectedClaimErr: errors.New(
				"claiming volumes: could not claim volume testvolume0: volume is currently unschedulable"),
		},

		{
			name: "retryable error on claim",
			volumeRequests: map[string]*structs.VolumeRequest{
				"vol0": {
					Name:           "vol0",
					Type:           structs.VolumeTypeCSI,
					Source:         "testvolume0",
					ReadOnly:       true,
					AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
					AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
					MountOptions:   &structs.CSIMountOptions{},
					PerAlloc:       false,
				},
			},
			startsWithClaims: true,
			expectedMounts: map[string]*csimanager.MountInfo{
				"vol0": &csimanager.MountInfo{Source: testMountSrc},
			},
			expectedCalls: map[string]int{
				"claim": 2, "mount": 1, "unmount": 1, "unpublish": 1},
		},
		{
			name: "already mounted",
			volumeRequests: map[string]*structs.VolumeRequest{
				"vol0": {
					Name:           "vol0",
					Type:           structs.VolumeTypeCSI,
					Source:         "testvolume0",
					ReadOnly:       true,
					AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
					AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
					MountOptions:   &structs.CSIMountOptions{},
					PerAlloc:       false,
				},
			},
			startsWithStubs: map[string]*state.CSIVolumeStub{"vol0": {
				VolumeID:       "vol0",
				PluginID:       "vol0-plugin",
				ExternalNodeID: "i-example",
				MountInfo:      &csimanager.MountInfo{Source: testMountSrc},
			}},
			startsWithValidMounts: true,
			expectedMounts: map[string]*csimanager.MountInfo{
				"vol0": &csimanager.MountInfo{Source: testMountSrc},
			},
			expectedCalls: map[string]int{"hasMount": 1, "unmount": 1, "unpublish": 1},
		},
		{
			name: "existing but invalid mounts",
			volumeRequests: map[string]*structs.VolumeRequest{
				"vol0": {
					Name:           "vol0",
					Type:           structs.VolumeTypeCSI,
					Source:         "testvolume0",
					ReadOnly:       true,
					AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
					AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
					MountOptions:   &structs.CSIMountOptions{},
					PerAlloc:       false,
				},
			},
			startsWithStubs: map[string]*state.CSIVolumeStub{"vol0": {
				VolumeID:       "testvolume0",
				PluginID:       "vol0-plugin",
				ExternalNodeID: "i-example",
				MountInfo:      &csimanager.MountInfo{Source: testMountSrc},
			}},
			startsWithValidMounts: false,
			expectedMounts: map[string]*csimanager.MountInfo{
				"vol0": &csimanager.MountInfo{Source: testMountSrc},
			},
			expectedCalls: map[string]int{
				"hasMount": 1, "claim": 1, "mount": 1, "unmount": 1, "unpublish": 1},
		},

		{
			name: "retry on failed unmount",
			volumeRequests: map[string]*structs.VolumeRequest{
				"vol0": {
					Name:           "vol0",
					Type:           structs.VolumeTypeCSI,
					Source:         "testvolume0",
					ReadOnly:       true,
					AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
					AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
					MountOptions:   &structs.CSIMountOptions{},
					PerAlloc:       false,
				},
			},
			failsFirstUnmount: true,
			expectedMounts: map[string]*csimanager.MountInfo{
				"vol0": &csimanager.MountInfo{Source: testMountSrc},
			},
			expectedCalls: map[string]int{
				"claim": 1, "mount": 1, "unmount": 2, "unpublish": 2},
		},

		{
			name:           "should not run",
			volumeRequests: map[string]*structs.VolumeRequest{},
		},
	}

	for i := range testcases {
		tc := testcases[i]
		t.Run(tc.name, func(t *testing.T) {

			alloc.Job.TaskGroups[0].Volumes = tc.volumeRequests

			callCounts := &callCounter{counts: map[string]int{}}
			mgr := mockPluginManager{mounter: mockVolumeMounter{
				hasMounts:         tc.startsWithValidMounts,
				callCounts:        callCounts,
				failsFirstUnmount: pointer.Of(tc.failsFirstUnmount),
			}}
			rpcer := mockRPCer{
				alloc:            alloc,
				callCounts:       callCounts,
				hasExistingClaim: pointer.Of(tc.startsWithClaims),
				schedulable:      pointer.Of(!tc.startsUnschedulable),
			}
			ar := mockAllocRunner{
				res: &cstructs.AllocHookResources{},
				caps: &drivers.Capabilities{
					FSIsolation:  drivers.FSIsolationChroot,
					MountConfigs: drivers.MountConfigSupportAll,
				},
				stubs: tc.startsWithStubs,
			}

			hook := newCSIHook(alloc, logger, mgr, rpcer, ar, ar.res, "secret")
			hook.minBackoffInterval = 1 * time.Millisecond
			hook.maxBackoffInterval = 10 * time.Millisecond
			hook.maxBackoffDuration = 500 * time.Millisecond

			must.NotNil(t, hook)

			if tc.expectedClaimErr != nil {
				must.EqError(t, hook.Prerun(), tc.expectedClaimErr.Error())
				mounts := ar.res.GetCSIMounts()
				must.Nil(t, mounts)
			} else {
				must.NoError(t, hook.Prerun())
				mounts := ar.res.GetCSIMounts()
				must.MapEq(t, tc.expectedMounts, mounts,
					must.Sprintf("got mounts: %v", mounts))
				must.NoError(t, hook.Postrun())
			}

			if tc.failsFirstUnmount {
				// retrying the unmount doesn't block Postrun, so give it time
				// to run once more before checking the call counts to ensure
				// this doesn't flake between 1 and 2 unmount/unpublish calls
				time.Sleep(100 * time.Millisecond)
			}

			counts := callCounts.get()
			must.MapEq(t, tc.expectedCalls, counts,
				must.Sprintf("got calls: %v", counts))

		})
	}

}

// TestCSIHook_Prerun_Validation tests that the validation of task capabilities
// in Prerun ensures at least one task supports CSI.
func TestCSIHook_Prerun_Validation(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	logger := testlog.HCLogger(t)
	volumeRequests := map[string]*structs.VolumeRequest{
		"vol0": {
			Name:           "vol0",
			Type:           structs.VolumeTypeCSI,
			Source:         "testvolume0",
			ReadOnly:       true,
			AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
			AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
			MountOptions:   &structs.CSIMountOptions{},
			PerAlloc:       false,
		},
	}

	type testCase struct {
		name        string
		caps        *drivers.Capabilities
		capFunc     func() (*drivers.Capabilities, error)
		expectedErr string
	}

	testcases := []testCase{
		{
			name: "invalid - driver does not support CSI",
			caps: &drivers.Capabilities{
				MountConfigs: drivers.MountConfigSupportNone,
			},
			capFunc:     nil,
			expectedErr: "no task supports CSI",
		},

		{
			name: "invalid - driver error",
			caps: &drivers.Capabilities{},
			capFunc: func() (*drivers.Capabilities, error) {
				return nil, errors.New("error thrown by driver")
			},
			expectedErr: "could not validate task driver capabilities: error thrown by driver",
		},

		{
			name: "valid - driver supports CSI",
			caps: &drivers.Capabilities{
				MountConfigs: drivers.MountConfigSupportAll,
			},
			capFunc: nil,
		},
	}

	for _, tc := range testcases {
		t.Run(tc.name, func(t *testing.T) {
			alloc.Job.TaskGroups[0].Volumes = volumeRequests

			callCounts := &callCounter{counts: map[string]int{}}
			mgr := mockPluginManager{mounter: mockVolumeMounter{
				callCounts:        callCounts,
				failsFirstUnmount: pointer.Of(false),
			}}
			rpcer := mockRPCer{
				alloc:            alloc,
				callCounts:       callCounts,
				hasExistingClaim: pointer.Of(false),
				schedulable:      pointer.Of(true),
			}

			ar := mockAllocRunner{
				res:     &cstructs.AllocHookResources{},
				caps:    tc.caps,
				capFunc: tc.capFunc,
			}

			hook := newCSIHook(alloc, logger, mgr, rpcer, ar, ar.res, "secret")
			require.NotNil(t, hook)

			if tc.expectedErr != "" {
				require.EqualError(t, hook.Prerun(), tc.expectedErr)
				mounts := ar.res.GetCSIMounts()
				require.Nil(t, mounts)
			} else {
				require.NoError(t, hook.Prerun())
				mounts := ar.res.GetCSIMounts()
				require.NotNil(t, mounts)
				require.NoError(t, hook.Postrun())
			}
		})
	}
}

// HELPERS AND MOCKS

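// callCounter is a thread-safe counter of mock method calls by name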
type callCounter struct {
	lock   sync.Mutex
	counts map[string]int
}

func (c *callCounter) inc(name string) {
	c.lock.Lock()
	defer c.lock.Unlock()
	c.counts[name]++
}

func (c *callCounter) get() map[string]int {
	c.lock.Lock()
	defer c.lock.Unlock()
	return maps.Clone(c.counts)
}

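// mockRPCer stands in for the client's RPC connection to the servers; its
// pointer fields let the mock flip claim and schedulability state between calls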
type mockRPCer struct {
	alloc            *structs.Allocation
	callCounts       *callCounter
	hasExistingClaim *bool
	schedulable      *bool
}

// RPC mocks the server RPCs, acting as though any request succeeds
func (r mockRPCer) RPC(method string, args any, reply any) error {
	switch method {
	case "CSIVolume.Claim":
		r.callCounts.inc("claim")
		req := args.(*structs.CSIVolumeClaimRequest)
		vol := r.testVolume(req.VolumeID)
		err := vol.Claim(req.ToClaim(), r.alloc)

		// after the first claim attempt is made, reset the volume's claims as
		// though it's been released from another node
		*r.hasExistingClaim = false
		*r.schedulable = true

		if err != nil {
			return err
		}

		resp := reply.(*structs.CSIVolumeClaimResponse)
		resp.PublishContext = map[string]string{}
		resp.Volume = vol
		resp.QueryMeta = structs.QueryMeta{}

	case "CSIVolume.Unpublish":
		r.callCounts.inc("unpublish")
		resp := reply.(*structs.CSIVolumeUnpublishResponse)
		resp.QueryMeta = structs.QueryMeta{}

	default:
		return fmt.Errorf("unexpected method")
	}
	return nil
}

// testVolume is a helper that optionally starts as unschedulable / claimed, so
// that we can test retryable vs non-retryable failures
func (r mockRPCer) testVolume(id string) *structs.CSIVolume {
	vol := structs.NewCSIVolume(id, 0)
	vol.Schedulable = *r.schedulable
	vol.PluginID = "plugin-" + id
	vol.RequestedCapabilities = []*structs.CSIVolumeCapability{
		{
			AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
			AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
		},
		{
			AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
			AccessMode:     structs.CSIVolumeAccessModeSingleNodeWriter,
		},
	}

	if *r.hasExistingClaim {
		vol.AccessMode = structs.CSIVolumeAccessModeSingleNodeReader
		vol.AttachmentMode = structs.CSIVolumeAttachmentModeFilesystem
		vol.ReadClaims["another-alloc-id"] = &structs.CSIVolumeClaim{
			AllocationID:   "another-alloc-id",
			NodeID:         "another-node-id",
			Mode:           structs.CSIVolumeClaimRead,
			AccessMode:     structs.CSIVolumeAccessModeSingleNodeReader,
			AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
			State:          structs.CSIVolumeClaimStateTaken,
		}
	}

	return vol
}

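// mockVolumeMounter mocks the volume mounter interface, recording
// mount/unmount/hasMount calls and optionally failing the first unmount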
type mockVolumeMounter struct {
	hasMounts         bool
	failsFirstUnmount *bool
	callCounts        *callCounter
}

func (vm mockVolumeMounter) MountVolume(ctx context.Context, vol *structs.CSIVolume, alloc *structs.Allocation, usageOpts *csimanager.UsageOptions, publishContext map[string]string) (*csimanager.MountInfo, error) {
	vm.callCounts.inc("mount")
	return &csimanager.MountInfo{
		Source: filepath.Join("test-alloc-dir", alloc.ID, vol.ID, usageOpts.ToFS()),
	}, nil
}

func (vm mockVolumeMounter) UnmountVolume(ctx context.Context, volID, remoteID, allocID string, usageOpts *csimanager.UsageOptions) error {
	vm.callCounts.inc("unmount")

	if *vm.failsFirstUnmount {
		*vm.failsFirstUnmount = false
		return fmt.Errorf("could not unmount")
	}

	return nil
}

func (vm mockVolumeMounter) HasMount(_ context.Context, mountInfo *csimanager.MountInfo) (bool, error) {
	vm.callCounts.inc("hasMount")
	return mountInfo != nil && vm.hasMounts, nil
}

func (vm mockVolumeMounter) ExternalID() string {
	return "i-example"
}

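// mockPluginManager mocks the CSI plugin manager so the hook never has to wait
// on a real node plugin and always gets the mock mounter back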
type mockPluginManager struct {
	mounter mockVolumeMounter
}

func (mgr mockPluginManager) WaitForPlugin(ctx context.Context, pluginType, pluginID string) error {
	return nil
}

func (mgr mockPluginManager) MounterForPlugin(ctx context.Context, pluginID string) (csimanager.VolumeMounter, error) {
	return mgr.mounter, nil
}

// no-op methods to fulfill the interface
func (mgr mockPluginManager) PluginManager() pluginmanager.PluginManager { return nil }
func (mgr mockPluginManager) Shutdown()                                  {}

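// mockAllocRunner mocks the alloc runner's view of the client: task driver
// capabilities plus the CSI volume stubs persisted in the client state DB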
type mockAllocRunner struct {
	res     *cstructs.AllocHookResources
	caps    *drivers.Capabilities
	capFunc func() (*drivers.Capabilities, error)

	stubs    map[string]*state.CSIVolumeStub
	stubFunc func() (map[string]*state.CSIVolumeStub, error)
}

func (ar mockAllocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
	if ar.capFunc != nil {
		return ar.capFunc()
	}
	return ar.caps, nil
}

func (ar mockAllocRunner) SetCSIVolumes(stubs map[string]*state.CSIVolumeStub) error {
	ar.stubs = stubs
	return nil
}

func (ar mockAllocRunner) GetCSIVolumes() (map[string]*state.CSIVolumeStub, error) {
	if ar.stubFunc != nil {
		return ar.stubFunc()
	}
	return ar.stubs, nil
}