400 lines
11 KiB
Go
400 lines
11 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
package drivermanager
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
|
|
log "github.com/hashicorp/go-hclog"
|
|
plugin "github.com/hashicorp/go-plugin"
|
|
"github.com/hashicorp/nomad/client/pluginmanager"
|
|
"github.com/hashicorp/nomad/client/pluginmanager/drivermanager/state"
|
|
"github.com/hashicorp/nomad/helper/pluginutils/loader"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/hashicorp/nomad/plugins/base"
|
|
"github.com/hashicorp/nomad/plugins/drivers"
|
|
pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
|
|
)
|
|
|
|
// ErrDriverNotFound is returned during Dispense when the requested driver
|
|
// plugin is not found in the plugin catalog
|
|
var ErrDriverNotFound = fmt.Errorf("driver not found")
|
|
|
|
// Manager is the interface used to manage driver plugins
|
|
type Manager interface {
|
|
pluginmanager.PluginManager
|
|
|
|
// Dispense returns a drivers.DriverPlugin for the given driver plugin name
|
|
// handling reattaching to an existing driver if available
|
|
Dispense(driver string) (drivers.DriverPlugin, error)
|
|
}
|
|
|
|
// TaskExecHandler is function to be called for executing commands in a task
|
|
type TaskExecHandler func(
|
|
ctx context.Context,
|
|
command []string,
|
|
tty bool,
|
|
stream drivers.ExecTaskStream) error
|
|
|
|
// EventHandler is a callback to be called for a task.
|
|
// The handler should not block execution.
|
|
type EventHandler func(*drivers.TaskEvent)
|
|
|
|
// TaskEventHandlerFactory returns an event handler for a given allocID/task name
|
|
type TaskEventHandlerFactory func(allocID, taskName string) EventHandler
|
|
|
|
// StateStorage is used to persist the driver managers state across
|
|
// agent restarts.
|
|
type StateStorage interface {
|
|
// GetDevicePluginState is used to retrieve the device manager's plugin
|
|
// state.
|
|
GetDriverPluginState() (*state.PluginState, error)
|
|
|
|
// PutDevicePluginState is used to store the device manager's plugin
|
|
// state.
|
|
PutDriverPluginState(state *state.PluginState) error
|
|
}
|
|
|
|
// UpdateNodeDriverInfoFn is the callback used to update the node from
|
|
// fingerprinting
|
|
type UpdateNodeDriverInfoFn func(string, *structs.DriverInfo)
|
|
|
|
// StorePluginReattachFn is used to store plugin reattachment configurations.
|
|
type StorePluginReattachFn func(*plugin.ReattachConfig) error
|
|
|
|
// FetchPluginReattachFn is used to retrieve the stored plugin reattachment
|
|
// configuration.
|
|
type FetchPluginReattachFn func() (*plugin.ReattachConfig, bool)
|
|
|
|
// Config is used to configure a driver manager
|
|
type Config struct {
|
|
// Logger is the logger used by the device manager
|
|
Logger log.Logger
|
|
|
|
// Loader is the plugin loader
|
|
Loader loader.PluginCatalog
|
|
|
|
// PluginConfig is the config passed to the launched plugins
|
|
PluginConfig *base.AgentConfig
|
|
|
|
// Updater is used to update the node when driver information changes
|
|
Updater UpdateNodeDriverInfoFn
|
|
|
|
// EventHandlerFactory is used to retrieve a task event handler
|
|
EventHandlerFactory TaskEventHandlerFactory
|
|
|
|
// State is used to manage the device managers state
|
|
State StateStorage
|
|
|
|
// AllowedDrivers if set will only start driver plugins for the given
|
|
// drivers
|
|
AllowedDrivers map[string]struct{}
|
|
|
|
// BlockedDrivers if set will not allow the given driver plugins to start
|
|
BlockedDrivers map[string]struct{}
|
|
}
|
|
|
|
// manager is used to manage a set of driver plugins
|
|
type manager struct {
|
|
// logger is the logger used by the device manager
|
|
logger log.Logger
|
|
|
|
// state is used to manage the device managers state
|
|
state StateStorage
|
|
|
|
// ctx is used to shutdown the device manager
|
|
ctx context.Context
|
|
cancel context.CancelFunc
|
|
|
|
// loader is the plugin loader
|
|
loader loader.PluginCatalog
|
|
|
|
// pluginConfig is the config passed to the launched plugins
|
|
pluginConfig *base.AgentConfig
|
|
|
|
// updater is used to update the node when device information changes
|
|
updater UpdateNodeDriverInfoFn
|
|
|
|
// eventHandlerFactory is passed to the instance managers and used to forward
|
|
// task events
|
|
eventHandlerFactory TaskEventHandlerFactory
|
|
|
|
// instances is the list of managed devices, access is serialized by instanceMu
|
|
instances map[string]*instanceManager
|
|
instancesMu sync.RWMutex
|
|
|
|
// reattachConfigs stores the plugin reattach configs
|
|
reattachConfigs map[loader.PluginID]*pstructs.ReattachConfig
|
|
reattachConfigLock sync.Mutex
|
|
|
|
// allows/block lists
|
|
allowedDrivers map[string]struct{}
|
|
blockedDrivers map[string]struct{}
|
|
|
|
// readyCh is ticked once at the end of Run()
|
|
readyCh chan struct{}
|
|
}
|
|
|
|
// New returns a new driver manager
|
|
func New(c *Config) *manager {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
return &manager{
|
|
logger: c.Logger.Named("driver_mgr"),
|
|
state: c.State,
|
|
ctx: ctx,
|
|
cancel: cancel,
|
|
loader: c.Loader,
|
|
pluginConfig: c.PluginConfig,
|
|
updater: c.Updater,
|
|
eventHandlerFactory: c.EventHandlerFactory,
|
|
instances: make(map[string]*instanceManager),
|
|
reattachConfigs: make(map[loader.PluginID]*pstructs.ReattachConfig),
|
|
allowedDrivers: c.AllowedDrivers,
|
|
blockedDrivers: c.BlockedDrivers,
|
|
readyCh: make(chan struct{}),
|
|
}
|
|
}
|
|
|
|
// PluginType returns the type of plugin this manager mananges
|
|
func (*manager) PluginType() string { return base.PluginTypeDriver }
|
|
|
|
// Run starts the manager, initializes driver plugins and blocks until Shutdown
|
|
// is called.
|
|
func (m *manager) Run() {
|
|
// Load any previous plugin reattach configuration
|
|
if err := m.loadReattachConfigs(); err != nil {
|
|
m.logger.Warn("unable to load driver plugin reattach configs, a driver process may have been leaked",
|
|
"error", err)
|
|
}
|
|
|
|
// Get driver plugins
|
|
driversPlugins := m.loader.Catalog()[base.PluginTypeDriver]
|
|
if len(driversPlugins) == 0 {
|
|
m.logger.Debug("exiting since there are no driver plugins")
|
|
m.cancel()
|
|
return
|
|
}
|
|
|
|
var skippedDrivers []string
|
|
for _, d := range driversPlugins {
|
|
id := loader.PluginInfoID(d)
|
|
if m.isDriverBlocked(id.Name) {
|
|
skippedDrivers = append(skippedDrivers, id.Name)
|
|
continue
|
|
}
|
|
|
|
storeFn := func(c *plugin.ReattachConfig) error {
|
|
return m.storePluginReattachConfig(id, c)
|
|
}
|
|
fetchFn := func() (*plugin.ReattachConfig, bool) {
|
|
return m.fetchPluginReattachConfig(id)
|
|
}
|
|
|
|
instance := newInstanceManager(&instanceManagerConfig{
|
|
Logger: m.logger,
|
|
Ctx: m.ctx,
|
|
Loader: m.loader,
|
|
StoreReattach: storeFn,
|
|
FetchReattach: fetchFn,
|
|
PluginConfig: m.pluginConfig,
|
|
ID: &id,
|
|
UpdateNodeFromDriver: m.updater,
|
|
EventHandlerFactory: m.eventHandlerFactory,
|
|
})
|
|
|
|
m.instancesMu.Lock()
|
|
m.instances[id.Name] = instance
|
|
m.instancesMu.Unlock()
|
|
}
|
|
|
|
if len(skippedDrivers) > 0 {
|
|
m.logger.Debug("drivers skipped due to allow/block list", "skipped_drivers", skippedDrivers)
|
|
}
|
|
|
|
// signal ready
|
|
close(m.readyCh)
|
|
}
|
|
|
|
// Shutdown cleans up all the plugins
|
|
func (m *manager) Shutdown() {
|
|
// Cancel the context to stop any requests
|
|
m.cancel()
|
|
|
|
m.instancesMu.RLock()
|
|
defer m.instancesMu.RUnlock()
|
|
|
|
// Go through and shut everything down
|
|
for _, i := range m.instances {
|
|
i.cleanup()
|
|
}
|
|
}
|
|
|
|
func (m *manager) WaitForFirstFingerprint(ctx context.Context) <-chan struct{} {
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
go m.waitForFirstFingerprint(ctx, cancel)
|
|
return ctx.Done()
|
|
}
|
|
|
|
func (m *manager) waitForFirstFingerprint(ctx context.Context, cancel context.CancelFunc) {
|
|
defer cancel()
|
|
// We don't want to start initial fingerprint wait until Run loop has
|
|
// finished
|
|
select {
|
|
case <-m.readyCh:
|
|
case <-ctx.Done():
|
|
// parent context canceled or timedout
|
|
return
|
|
case <-m.ctx.Done():
|
|
// shutdown called
|
|
return
|
|
}
|
|
|
|
var mu sync.Mutex
|
|
driversByStatus := map[drivers.HealthState][]string{}
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
recordDriver := func(name string, lastHeath drivers.HealthState) {
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
|
|
updated := append(driversByStatus[lastHeath], name) //nolint:gocritic
|
|
driversByStatus[lastHeath] = updated
|
|
}
|
|
|
|
// loop through instances and wait for each to finish initial fingerprint
|
|
m.instancesMu.RLock()
|
|
for n, i := range m.instances {
|
|
wg.Add(1)
|
|
go func(name string, instance *instanceManager) {
|
|
defer wg.Done()
|
|
instance.WaitForFirstFingerprint(ctx)
|
|
recordDriver(name, instance.getLastHealth())
|
|
}(n, i)
|
|
}
|
|
m.instancesMu.RUnlock()
|
|
wg.Wait()
|
|
|
|
m.logger.Debug("detected drivers", "drivers", driversByStatus)
|
|
}
|
|
|
|
func (m *manager) loadReattachConfigs() error {
|
|
m.reattachConfigLock.Lock()
|
|
defer m.reattachConfigLock.Unlock()
|
|
|
|
s, err := m.state.GetDriverPluginState()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if s != nil {
|
|
for name, c := range s.ReattachConfigs {
|
|
if m.isDriverBlocked(name) {
|
|
m.logger.Warn("reattach config for driver plugin found but driver is blocked due to allow/block list, killing plugin",
|
|
"driver", name)
|
|
m.shutdownBlockedDriver(name, c)
|
|
continue
|
|
}
|
|
|
|
id := loader.PluginID{
|
|
PluginType: base.PluginTypeDriver,
|
|
Name: name,
|
|
}
|
|
|
|
m.reattachConfigs[id] = c
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// shutdownBlockedDriver is used to forcefully shutdown a running driver plugin
|
|
// when it has been blocked due to allow/block lists
|
|
func (m *manager) shutdownBlockedDriver(name string, reattach *pstructs.ReattachConfig) {
|
|
c, err := pstructs.ReattachConfigToGoPlugin(reattach)
|
|
if err != nil {
|
|
m.logger.Warn("failed to reattach and kill blocked driver plugin",
|
|
"driver", name, "error", err)
|
|
return
|
|
|
|
}
|
|
pluginInstance, err := m.loader.Reattach(name, base.PluginTypeDriver, c)
|
|
if err != nil {
|
|
m.logger.Warn("failed to reattach and kill blocked driver plugin",
|
|
"driver", name, "error", err)
|
|
return
|
|
}
|
|
|
|
if !pluginInstance.Exited() {
|
|
pluginInstance.Kill()
|
|
}
|
|
}
|
|
|
|
// storePluginReattachConfig is used as a callback to the instance managers and
|
|
// persists thhe plugin reattach configurations.
|
|
func (m *manager) storePluginReattachConfig(id loader.PluginID, c *plugin.ReattachConfig) error {
|
|
m.reattachConfigLock.Lock()
|
|
defer m.reattachConfigLock.Unlock()
|
|
|
|
if c == nil {
|
|
delete(m.reattachConfigs, id)
|
|
} else {
|
|
// Store the new reattach config
|
|
m.reattachConfigs[id] = pstructs.ReattachConfigFromGoPlugin(c)
|
|
}
|
|
// Persist the state
|
|
s := &state.PluginState{
|
|
ReattachConfigs: make(map[string]*pstructs.ReattachConfig, len(m.reattachConfigs)),
|
|
}
|
|
|
|
for id, c := range m.reattachConfigs {
|
|
s.ReattachConfigs[id.Name] = c
|
|
}
|
|
|
|
return m.state.PutDriverPluginState(s)
|
|
}
|
|
|
|
// fetchPluginReattachConfig is used as a callback to the instance managers and
|
|
// retrieves the plugin reattach config. If it has not been stored it will
|
|
// return nil
|
|
func (m *manager) fetchPluginReattachConfig(id loader.PluginID) (*plugin.ReattachConfig, bool) {
|
|
m.reattachConfigLock.Lock()
|
|
defer m.reattachConfigLock.Unlock()
|
|
|
|
if cfg, ok := m.reattachConfigs[id]; ok {
|
|
c, err := pstructs.ReattachConfigToGoPlugin(cfg)
|
|
if err != nil {
|
|
m.logger.Warn("failed to read plugin reattach config", "config", cfg, "error", err)
|
|
delete(m.reattachConfigs, id)
|
|
return nil, false
|
|
}
|
|
return c, true
|
|
}
|
|
return nil, false
|
|
}
|
|
|
|
func (m *manager) Dispense(d string) (drivers.DriverPlugin, error) {
|
|
m.instancesMu.RLock()
|
|
defer m.instancesMu.RUnlock()
|
|
if instance, ok := m.instances[d]; ok {
|
|
return instance.dispense()
|
|
}
|
|
|
|
return nil, ErrDriverNotFound
|
|
}
|
|
|
|
func (m *manager) isDriverBlocked(name string) bool {
|
|
// Block drivers that are not in the allowed list if it is set.
|
|
if _, ok := m.allowedDrivers[name]; len(m.allowedDrivers) > 0 && !ok {
|
|
return true
|
|
}
|
|
|
|
// Block drivers that are in the blocked list
|
|
if _, ok := m.blockedDrivers[name]; ok {
|
|
return true
|
|
}
|
|
return false
|
|
}
|