759310d13a
The volume watcher design was based on deploymentwatcher and drainer, but with an important difference: we don't want to maintain a goroutine for the lifetime of the volume, so we stop a volume's volumewatcher goroutine once that volume has no more claims to free. But that shutdown races with updates from the parent goroutine, and it's possible to drop updates. Dropped updates are picked up on the next core GC job, but we're most likely to hit this race when we're replacing an allocation, which is exactly when we least want to wait. Wait until the volume has "settled" before stopping this goroutine, so that the race between shutdown and the parent goroutine sending on `<-updateCh` is pushed past the window where we most care about freeing claims quickly.

* Fixes a resource leak when volumewatchers are no longer needed: the volume is nil and can never be started again, so the volume's `watcher` should be removed from the top-level `Watcher`.

* De-flakes the GC job test: the test throws an error because the claimed node doesn't exist and is unreachable. This flaked instead of failing because we didn't correctly wait for the first pass through the volumewatcher. Make the GC job test wait for the volumewatcher to reach the quiescent timeout state before running the GC eval under test, so that we're sure the GC job's work isn't being picked up by the processing of one of the earlier claims. Update the claims used so that the GC pass won't hit a node unpublish error.

* Adds trace logging to unpublish operations.
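The "settling" behavior described above lives in the per-volume goroutine (the volumeWatcher type, which is not shown in the file below). The following is a minimal sketch of that idea only, assuming the child watcher carries a `ctx`, an `updateCh` channel, and helpers like `volumeReap` and `Stop`; those names and the structure are assumptions for illustration, not the actual implementation.

// Hypothetical sketch: shows the quiescent-timeout pattern, not the real
// volumeWatcher code. Field and method names are assumptions.
func (vw *volumeWatcher) watch(vol *structs.CSIVolume) {
	// process the volume state we were started with
	vw.volumeReap(vol)

	quiescent := time.NewTimer(vw.quiescentTimeout)
	defer quiescent.Stop()

	for {
		select {
		case <-vw.ctx.Done():
			return

		case vol = <-vw.updateCh:
			// process the claim update, then push the settling window out
			// again so a claim freed right after an allocation is replaced
			// is handled here rather than by the next core GC pass
			vw.volumeReap(vol)
			if !quiescent.Stop() {
				<-quiescent.C
			}
			quiescent.Reset(vw.quiescentTimeout)

		case <-quiescent.C:
			// no updates for a full quiescentTimeout: the volume has
			// settled, so this goroutine can exit and the parent Watcher
			// can drop it from its watchers map
			vw.Stop()
			return
		}
	}
}

Because the parent goroutine keeps sending claim updates while the volume is still active, resetting the timer on every update pushes the shutdown race out past the period when quick claim freeing matters most.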
200 lines | 5 KiB | Go
package volumewatcher

import (
	"context"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

// Watcher is used to watch volumes and their allocations created
// by the scheduler and trigger the scheduler when allocation health
// transitions.
type Watcher struct {
	enabled bool
	logger  log.Logger

	// rpc contains the set of Server methods that can be used by
	// the volumes watcher for RPC
	rpc CSIVolumeRPC

	// the ACL needed to send RPCs
	leaderAcl string

	// state is the state that is watched for state changes.
	state *state.StateStore

	// watchers is the set of active watchers, one per volume
	watchers map[string]*volumeWatcher

	// ctx and exitFn are used to cancel the watcher
	ctx    context.Context
	exitFn context.CancelFunc

	// quiescentTimeout is the time we wait until the volume has "settled"
	// before stopping the child watcher goroutines
	quiescentTimeout time.Duration

	wlock sync.RWMutex
}

var defaultQuiescentTimeout = time.Minute * 5

// NewVolumesWatcher returns a volumes watcher that is used to watch
// volumes and trigger the scheduler as needed.
func NewVolumesWatcher(logger log.Logger, rpc CSIVolumeRPC, leaderAcl string) *Watcher {

	// the leader step-down calls SetEnabled(false) which is what
	// cancels this context, rather than passing in its own shutdown
	// context
	ctx, exitFn := context.WithCancel(context.Background())

	return &Watcher{
		rpc:              rpc,
		logger:           logger.Named("volumes_watcher"),
		ctx:              ctx,
		exitFn:           exitFn,
		leaderAcl:        leaderAcl,
		quiescentTimeout: defaultQuiescentTimeout,
	}
}

// SetEnabled is used to control if the watcher is enabled. The
// watcher should only be enabled on the active leader. When being
// enabled the state and leader's ACL is passed in as it is no longer
// valid once a leader election has taken place.
func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore, leaderAcl string) {
	w.wlock.Lock()
	defer w.wlock.Unlock()

	wasEnabled := w.enabled
	w.enabled = enabled
	w.leaderAcl = leaderAcl

	if state != nil {
		w.state = state
	}

	// Flush the state to create the necessary objects
	w.flush(enabled)

	// If we are starting now, launch the watch daemon
	if enabled && !wasEnabled {
		go w.watchVolumes(w.ctx)
	}
}

// flush is used to clear the state of the watcher
func (w *Watcher) flush(enabled bool) {
	// Stop all the watchers and clear it
	for _, watcher := range w.watchers {
		watcher.Stop()
	}

	// Kill everything associated with the watcher
	if w.exitFn != nil {
		w.exitFn()
	}

	w.watchers = make(map[string]*volumeWatcher, 32)
	w.ctx, w.exitFn = context.WithCancel(context.Background())
}

// watchVolumes is the long lived go-routine that watches for volumes to
// add and remove watchers on.
func (w *Watcher) watchVolumes(ctx context.Context) {
	vIndex := uint64(1)
	for {
		volumes, idx, err := w.getVolumes(ctx, vIndex)
		if err != nil {
			if err == context.Canceled {
				return
			}
			w.logger.Error("failed to retrieve volumes", "error", err)
		}

		vIndex = idx // last-seen index
		for _, v := range volumes {
			if err := w.add(v); err != nil {
				w.logger.Error("failed to track volume", "volume_id", v.ID, "error", err)
			}
		}
	}
}

// getVolumes retrieves all volumes blocking at the given index.
func (w *Watcher) getVolumes(ctx context.Context, minIndex uint64) ([]*structs.CSIVolume, uint64, error) {
	resp, index, err := w.state.BlockingQuery(w.getVolumesImpl, minIndex, ctx)
	if err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.CSIVolume), index, nil
}

// getVolumesImpl retrieves all volumes from the passed state store.
func (w *Watcher) getVolumesImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {

	iter, err := state.CSIVolumes(ws)
	if err != nil {
		return nil, 0, err
	}

	var volumes []*structs.CSIVolume
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}
		volume := raw.(*structs.CSIVolume)
		volumes = append(volumes, volume)
	}

	// Use the last index that affected the volume table
	index, err := state.Index("csi_volumes")
	if err != nil {
		return nil, 0, err
	}

	return volumes, index, nil
}

// add adds a volume to the watch list
func (w *Watcher) add(v *structs.CSIVolume) error {
	w.wlock.Lock()
	defer w.wlock.Unlock()
	_, err := w.addLocked(v)
	return err
}

// addLocked adds a volume to the watch list and should only be called when
// locked. Creating the volumeWatcher starts a go routine to .watch() it
func (w *Watcher) addLocked(v *structs.CSIVolume) (*volumeWatcher, error) {
	// Not enabled so no-op
	if !w.enabled {
		return nil, nil
	}

	// Already watched so trigger an update for the volume
	if watcher, ok := w.watchers[v.ID+v.Namespace]; ok {
		watcher.Notify(v)
		return nil, nil
	}

	watcher := newVolumeWatcher(w, v)
	w.watchers[v.ID+v.Namespace] = watcher
	return watcher, nil
}

// remove removes a volume from the watch list
func (w *Watcher) remove(volID string) {
	w.wlock.Lock()
	defer w.wlock.Unlock()
	delete(w.watchers, volID)
}
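For context on how the Watcher above might be wired in, here is a brief usage sketch assuming a server that owns the watcher and toggles it on leadership transitions. The server type, its fields, and the leadership hooks below are illustrative assumptions, not the actual Nomad server code.

// Illustrative only: hypotheticalServer and its methods are stand-ins for
// wherever the real server constructs and drives the volumes watcher.
type hypotheticalServer struct {
	logger        log.Logger
	state         *state.StateStore
	volumeWatcher *Watcher
}

func newHypotheticalServer(logger log.Logger, rpc CSIVolumeRPC, state *state.StateStore) *hypotheticalServer {
	return &hypotheticalServer{
		logger: logger,
		state:  state,
		// constructed once; the leader ACL is supplied later via SetEnabled
		volumeWatcher: NewVolumesWatcher(logger, rpc, ""),
	}
}

func (srv *hypotheticalServer) establishLeadership(leaderAcl string) {
	// enable on each leadership acquisition with the current state store
	// and the new leader's ACL token
	srv.volumeWatcher.SetEnabled(true, srv.state, leaderAcl)
}

func (srv *hypotheticalServer) revokeLeadership() {
	// step-down disables the watcher; flush() cancels the shared context
	// and stops every per-volume goroutine
	srv.volumeWatcher.SetEnabled(false, nil, "")
}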