0a19fe3b60
We use capped exponential backoff in several places in the code when handling failures. The code we've copy-and-pasted all over has a check to see if the backoff is greater than the limit, but this check happens after the bitshift, and we always increment the number of attempts. This causes an overflow with a fairly small number of failures (e.g. at one place I tested, it occurs after only 24 iterations), resulting in a negative backoff which then never recovers. The backoff becomes a tight loop consuming resources and/or DoS'ing a Nomad RPC handler or an external API such as Vault.

Note this doesn't occur in places where we cap the number of iterations so the loop breaks (usually to return an error), so long as the number of iterations is reasonable.

Introduce a helper with a check on the cap before the bitshift to avoid overflow in all places this can occur.

Fixes: #18199
Co-authored-by: stswidwinski <stan.swidwinski@gmail.com>
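For reference, a minimal sketch of an overflow-safe helper of this shape. The signature matches how the file below calls helper.Backoff(baseline, limit, attempt); the body is illustrative rather than necessarily the exact implementation that landed in helper/backoff.go:

package helper

import "time"

// Backoff computes a capped exponential backoff. The cap is checked
// before the bitshift, so the computation can never overflow into a
// negative duration no matter how large attempt grows.
func Backoff(backoffBase, backoffLimit time.Duration, attempt uint64) time.Duration {
	// time.Duration is an int64 count of nanoseconds; shifting by 63 or
	// more bits always overflows, so short-circuit to the limit first.
	if attempt >= 63 {
		return backoffLimit
	}
	// If backoffBase<<attempt would exceed the limit, the shift itself
	// may overflow, so compare against the down-shifted limit instead.
	if backoffBase > backoffLimit>>attempt {
		return backoffLimit
	}
	return backoffBase << attempt
}

With the old pattern (shift first, clamp after), a negative result is never greater than the limit, so the clamp never fires and the backoff stays negative forever; checking before the shift removes that failure mode.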
524 lines · 13 KiB · Go
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package devicemanager

import (
	"context"
	"fmt"
	"sync"
	"time"

	log "github.com/hashicorp/go-hclog"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/pluginutils/loader"
	"github.com/hashicorp/nomad/helper/pluginutils/singleton"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/base"
	bstructs "github.com/hashicorp/nomad/plugins/base/structs"
	"github.com/hashicorp/nomad/plugins/device"
)

const (
	// statsBackoffBaseline is the baseline time for exponential backoff while
	// collecting device stats.
	statsBackoffBaseline = 5 * time.Second

	// statsBackoffLimit is the limit of the exponential backoff for collecting
	// device statistics.
	statsBackoffLimit = 30 * time.Minute
)
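
// Note: with the 5s baseline above, baseline<<attempt overflows int64
// nanoseconds once attempt reaches 31 (5e9ns * 2^31 > 2^63-1), which is
// why helper.Backoff, used in collectStats below, checks the cap before
// the bitshift rather than clamping afterwards.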

// instanceManagerConfig configures a device instance manager
type instanceManagerConfig struct {
	// Logger is the logger used by the device instance manager
	Logger log.Logger

	// Ctx is used to shut down the device instance manager
	Ctx context.Context

	// Loader is the plugin loader
	Loader loader.PluginCatalog

	// StoreReattach is used to store a plugin's reattach config
	StoreReattach StorePluginReattachFn

	// PluginConfig is the config passed to the launched plugins
	PluginConfig *base.AgentConfig

	// Id is the ID of the plugin being managed
	Id *loader.PluginID

	// FingerprintOutCh is used to emit new fingerprinted devices
	FingerprintOutCh chan<- struct{}

	// StatsInterval is the interval at which we collect statistics.
	StatsInterval time.Duration
}

// instanceManager is used to manage a single device plugin
type instanceManager struct {
	// logger is the logger used by the device instance manager
	logger log.Logger

	// ctx is used to shut down the device manager
	ctx context.Context

	// cancel is used to shut down management of this device plugin
	cancel context.CancelFunc

	// loader is the plugin loader
	loader loader.PluginCatalog

	// storeReattach is used to store a plugin's reattach config
	storeReattach StorePluginReattachFn

	// pluginConfig is the config passed to the launched plugins
	pluginConfig *base.AgentConfig

	// id is the ID of the plugin being managed
	id *loader.PluginID

	// fingerprintOutCh is used to emit new fingerprinted devices
	fingerprintOutCh chan<- struct{}

	// plugin is the plugin instance being managed
	plugin loader.PluginInstance

	// device is the device plugin being managed
	device device.DevicePlugin

	// pluginLock locks access to the device and plugin
	pluginLock sync.Mutex

	// shutdownLock is used to serialize attempts to shut down
	shutdownLock sync.Mutex

	// devices is the set of fingerprinted devices
	devices    []*device.DeviceGroup
	deviceLock sync.RWMutex

	// statsInterval is the interval at which we collect statistics.
	statsInterval time.Duration

	// deviceStats is the set of statistics objects per device
	deviceStats     []*device.DeviceGroupStats
	deviceStatsLock sync.RWMutex

	// firstFingerprintCh is closed after the first successful fingerprint
	// and is used to gate launching the stats collection.
	firstFingerprintCh chan struct{}
	hasFingerprinted   bool
}

// newInstanceManager returns a new device instance manager. It is expected
// that the context passed in the configuration is cancelled in order to shut
// down launched goroutines.
func newInstanceManager(c *instanceManagerConfig) *instanceManager {
	ctx, cancel := context.WithCancel(c.Ctx)
	i := &instanceManager{
		logger:             c.Logger.With("plugin", c.Id.Name),
		ctx:                ctx,
		cancel:             cancel,
		loader:             c.Loader,
		storeReattach:      c.StoreReattach,
		pluginConfig:       c.PluginConfig,
		id:                 c.Id,
		fingerprintOutCh:   c.FingerprintOutCh,
		statsInterval:      c.StatsInterval,
		firstFingerprintCh: make(chan struct{}),
	}

	go i.run()
	return i
}

// HasDevices returns whether the instance is managing the passed devices
func (i *instanceManager) HasDevices(d *structs.AllocatedDeviceResource) bool {
	i.deviceLock.RLock()
	defer i.deviceLock.RUnlock()

OUTER:
	for _, dev := range i.devices {
		if dev.Name != d.Name || dev.Type != d.Type || dev.Vendor != d.Vendor {
			continue
		}

		// Check that we have all the requested devices
		ids := make(map[string]struct{}, len(dev.Devices))
		for _, inst := range dev.Devices {
			ids[inst.ID] = struct{}{}
		}

		for _, reqID := range d.DeviceIDs {
			if _, ok := ids[reqID]; !ok {
				continue OUTER
			}
		}

		return true
	}

	return false
}

// AllStats returns all the device statistics returned by the device plugin.
func (i *instanceManager) AllStats() []*device.DeviceGroupStats {
	i.deviceStatsLock.RLock()
	defer i.deviceStatsLock.RUnlock()
	return i.deviceStats
}

// DeviceStats returns the device statistics for the requested devices.
func (i *instanceManager) DeviceStats(d *structs.AllocatedDeviceResource) *device.DeviceGroupStats {
	i.deviceStatsLock.RLock()
	defer i.deviceStatsLock.RUnlock()

	// Find the device in question and then gather the instance statistics we
	// are interested in
	for _, group := range i.deviceStats {
		if group.Vendor != d.Vendor || group.Type != d.Type || group.Name != d.Name {
			continue
		}

		// We found the group we want so now grab the instance stats
		out := &device.DeviceGroupStats{
			Vendor:        d.Vendor,
			Type:          d.Type,
			Name:          d.Name,
			InstanceStats: make(map[string]*device.DeviceStats, len(d.DeviceIDs)),
		}

		for _, id := range d.DeviceIDs {
			out.InstanceStats[id] = group.InstanceStats[id]
		}

		return out
	}

	return nil
}

// Reserve reserves the given devices
func (i *instanceManager) Reserve(d *structs.AllocatedDeviceResource) (*device.ContainerReservation, error) {
	// Get a device plugin
	devicePlugin, err := i.dispense()
	if err != nil {
		i.logger.Error("dispensing plugin failed", "error", err)
		return nil, err
	}

	// Send the reserve request
	return devicePlugin.Reserve(d.DeviceIDs)
}

// Devices returns the detected devices.
func (i *instanceManager) Devices() []*device.DeviceGroup {
	i.deviceLock.RLock()
	defer i.deviceLock.RUnlock()
	return i.devices
}

// WaitForFirstFingerprint waits until either the plugin fingerprints, the
// passed context is done, or the plugin instance manager is shut down.
func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) {
	select {
	case <-i.ctx.Done():
	case <-ctx.Done():
	case <-i.firstFingerprintCh:
	}
}

// run is a long-lived goroutine that starts the fingerprinting and stats
// collection goroutines and then shuts down the plugin on exit.
func (i *instanceManager) run() {
	// Dispense once to ensure we are given a valid plugin
	if _, err := i.dispense(); err != nil {
		i.logger.Error("dispensing initial plugin failed", "error", err)
		return
	}

	// Create a waitgroup to block on shutdown for all created goroutines to
	// exit
	var wg sync.WaitGroup

	// Start the fingerprinter
	wg.Add(1)
	go func() {
		i.fingerprint()
		wg.Done()
	}()

	// Wait for a valid result before starting stats collection
	select {
	case <-i.ctx.Done():
		goto DONE
	case <-i.firstFingerprintCh:
	}

	// Start stats
	wg.Add(1)
	go func() {
		i.collectStats()
		wg.Done()
	}()

	// Do a final cleanup
DONE:
	wg.Wait()
	i.cleanup()
}

// dispense is used to dispense a plugin.
func (i *instanceManager) dispense() (plugin device.DevicePlugin, err error) {
	i.pluginLock.Lock()
	defer i.pluginLock.Unlock()

	// See if we already have a running instance
	if i.plugin != nil && !i.plugin.Exited() {
		return i.device, nil
	}

	// Get an instance of the plugin
	pluginInstance, err := i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
	if err != nil {
		// Retry as the error just indicates the singleton has exited
		if err == singleton.SingletonPluginExited {
			pluginInstance, err = i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
		}

		// If we still have an error there is a real problem
		if err != nil {
			return nil, fmt.Errorf("failed to start plugin: %v", err)
		}
	}

	// Convert to a device plugin
	device, ok := pluginInstance.Plugin().(device.DevicePlugin)
	if !ok {
		pluginInstance.Kill()
		return nil, fmt.Errorf("plugin loaded does not implement the device interface")
	}

	// Store the plugin and device
	i.plugin = pluginInstance
	i.device = device

	// Store the reattach config
	if c, ok := pluginInstance.ReattachConfig(); ok {
		i.storeReattach(c)
	}

	return device, nil
}

// cleanup shuts down the plugin
func (i *instanceManager) cleanup() {
	i.shutdownLock.Lock()
	i.pluginLock.Lock()
	defer i.pluginLock.Unlock()
	defer i.shutdownLock.Unlock()

	if i.plugin != nil && !i.plugin.Exited() {
		i.plugin.Kill()
		i.storeReattach(nil)
	}
}

// fingerprint is a long-lived routine used to fingerprint the device
func (i *instanceManager) fingerprint() {
START:
	// Get a device plugin
	devicePlugin, err := i.dispense()
	if err != nil {
		i.logger.Error("dispensing plugin failed", "error", err)
		i.cancel()
		return
	}

	// Start fingerprinting
	fingerprintCh, err := devicePlugin.Fingerprint(i.ctx)
	if err == device.ErrPluginDisabled {
		i.logger.Info("fingerprinting failed: plugin is not enabled")
		i.handleFingerprintError()
		return
	} else if err != nil {
		i.logger.Error("fingerprinting failed", "error", err)
		i.handleFingerprintError()
		return
	}

	var fresp *device.FingerprintResponse
	var ok bool
	for {
		select {
		case <-i.ctx.Done():
			return
		case fresp, ok = <-fingerprintCh:
		}

		if !ok {
			i.logger.Trace("exiting since fingerprinting gracefully shutdown")
			i.handleFingerprintError()
			return
		}

		// Guard against error by the plugin
		if fresp == nil {
			continue
		}

		// Handle any errors
		if fresp.Error != nil {
			if fresp.Error == bstructs.ErrPluginShutdown {
				i.logger.Error("plugin exited unexpectedly")
				goto START
			}

			i.logger.Error("fingerprinting returned an error", "error", fresp.Error)
			i.handleFingerprintError()
			return
		}

		if err := i.handleFingerprint(fresp); err != nil {
			// Cancel the context so we cleanup all goroutines
			i.logger.Error("returned devices failed fingerprinting", "error", err)
			i.handleFingerprintError()
		}
	}
}

// handleFingerprintError exits the manager and shuts down the plugin.
func (i *instanceManager) handleFingerprintError() {
	// Clear out the devices and trigger a node update
	i.deviceLock.Lock()
	defer i.deviceLock.Unlock()

	// If we have fingerprinted before clear it out
	if i.hasFingerprinted {
		// Store the new devices
		i.devices = nil

		// Trigger that we have new devices
		select {
		case i.fingerprintOutCh <- struct{}{}:
		default:
		}
	}

	// Cancel the context so we cleanup all goroutines
	i.cancel()
}

// handleFingerprint stores the new devices and triggers the fingerprint output
// channel. An error is returned if the passed devices don't pass validation.
func (i *instanceManager) handleFingerprint(f *device.FingerprintResponse) error {
	// Validate the received devices
	var validationErr multierror.Error
	for i, d := range f.Devices {
		if err := d.Validate(); err != nil {
			multierror.Append(&validationErr, multierror.Prefix(err, fmt.Sprintf("device group %d: ", i)))
		}
	}

	if err := validationErr.ErrorOrNil(); err != nil {
		return err
	}

	i.deviceLock.Lock()
	defer i.deviceLock.Unlock()

	// Store the new devices
	i.devices = f.Devices

	// Mark that we have received data
	if !i.hasFingerprinted {
		close(i.firstFingerprintCh)
		i.hasFingerprinted = true
	}

	// Trigger that we have data to pull
	select {
	case i.fingerprintOutCh <- struct{}{}:
	default:
	}

	return nil
}

// collectStats is a long-lived goroutine for collecting device statistics. It
// handles errors by backing off exponentially and retrying.
func (i *instanceManager) collectStats() {
	var attempt uint64
	var backoff time.Duration

START:
	// Get a device plugin
	devicePlugin, err := i.dispense()
	if err != nil {
		i.logger.Error("dispensing plugin failed", "error", err)
		i.cancel()
		return
	}

	// Start stats collection
	statsCh, err := devicePlugin.Stats(i.ctx, i.statsInterval)
	if err != nil {
		i.logger.Error("stats collection failed", "error", err)
		return
	}

	var sresp *device.StatsResponse
	var ok bool
	for {
		select {
		case <-i.ctx.Done():
			return
		case sresp, ok = <-statsCh:
		}

		if !ok {
			i.logger.Trace("exiting since stats gracefully shutdown")
			return
		}

		// Guard against error by the plugin
		if sresp == nil {
			continue
		}

		// Handle any errors
		if sresp.Error != nil {
			if sresp.Error == bstructs.ErrPluginShutdown {
				i.logger.Error("plugin exited unexpectedly")
				goto START
			}

			// Retry with an exponential backoff
			backoff = helper.Backoff(statsBackoffBaseline, statsBackoffLimit, attempt)
			attempt++
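			// Note: helper.Backoff checks the cap before the bitshift,
			// so a large attempt count cannot overflow into a negative
			// backoff (#18199).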

			i.logger.Error("stats returned an error", "error", sresp.Error, "retry", backoff)

			select {
			case <-i.ctx.Done():
				return
			case <-time.After(backoff):
				goto START
			}
		}

		// Reset the attempts since we got statistics
		attempt = 0

		// Store the new stats
		if sresp.Groups != nil {
			i.deviceStatsLock.Lock()
			i.deviceStats = sresp.Groups
			i.deviceStatsLock.Unlock()
		}
	}
}
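
For completeness, the property the fix guarantees can be checked with a small test along these lines (a hypothetical test, assuming the helper sketched above; the constants are the ones defined in this file):

package devicemanager

import (
	"testing"

	"github.com/hashicorp/nomad/helper"
)

// The backoff must stay within [0, limit] even for attempt counts far past
// the point where the old shift-then-clamp pattern overflowed.
func TestBackoffNeverNegative(t *testing.T) {
	for attempt := uint64(0); attempt < 1000; attempt++ {
		b := helper.Backoff(statsBackoffBaseline, statsBackoffLimit, attempt)
		if b < 0 || b > statsBackoffLimit {
			t.Fatalf("attempt %d: backoff %v out of range", attempt, b)
		}
	}
}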