open-nomad/client/pluginmanager/drivermanager/instance.go

484 lines
13 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package drivermanager
import (
"context"
"fmt"
"sync"
"time"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/pluginutils/loader"
"github.com/hashicorp/nomad/helper/pluginutils/singleton"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/base"
bstructs "github.com/hashicorp/nomad/plugins/base/structs"
"github.com/hashicorp/nomad/plugins/drivers"
)
// driverFPBackoffBaseline is the base delay handed to the exponential backoff
// helper when a driver's fingerprint (or task event) stream has to be
// re-established.
const driverFPBackoffBaseline = 5 * time.Second

// driverFPBackoffLimit is the upper bound handed to the exponential backoff
// helper for those same retries.
const driverFPBackoffLimit = 2 * time.Minute
// instanceManagerConfig carries everything newInstanceManager needs to build
// and start the manager for a single driver plugin. Each field is copied onto
// the resulting instanceManager.
type instanceManagerConfig struct {
// Logger is the base logger; the manager derives its own logger from it,
// tagged with the driver name.
Logger log.Logger
// Ctx is the parent context. Cancelling it shuts down the driver instance
// manager and the goroutines it launched.
Ctx context.Context
// Loader is the plugin catalog used to dispense or reattach to the plugin
Loader loader.PluginCatalog
// StoreReattach is used to store a plugin's reattach config
StoreReattach StorePluginReattachFn
// FetchReattach is used to retrieve a plugin's reattach config
FetchReattach FetchPluginReattachFn
// PluginConfig is the config passed to the launched plugins
PluginConfig *base.AgentConfig
// ID is the ID of the plugin being managed
ID *loader.PluginID
// UpdateNodeFromDriver is the callback used to update the node from
// fingerprinting
UpdateNodeFromDriver UpdateNodeDriverInfoFn
// EventHandlerFactory is used to fetch a task event handler
EventHandlerFactory TaskEventHandlerFactory
}
// instanceManager is used to manage a single driver plugin: it dispenses the
// plugin, consumes its fingerprint and task event streams, and kills the
// plugin on shutdown.
type instanceManager struct {
// logger is the logger used by the driver instance manager
logger log.Logger
// ctx is used to shutdown the driver manager; it is derived from the
// context supplied in the configuration.
ctx context.Context
// cancel is used to shutdown management of this driver plugin
cancel context.CancelFunc
// loader is the plugin loader
loader loader.PluginCatalog
// storeReattach is used to store a plugin's reattach config
storeReattach StorePluginReattachFn
// fetchReattach is used to retrieve a plugin's reattach config
fetchReattach FetchPluginReattachFn
// pluginConfig is the config passed to the launched plugins
pluginConfig *base.AgentConfig
// id is the ID of the plugin being managed
id *loader.PluginID
// plugin is the plugin instance being managed
plugin loader.PluginInstance
// driver is the driver plugin being managed
driver drivers.DriverPlugin
// pluginLock locks access to the driver and plugin
pluginLock sync.Mutex
// shutdownLock is used to serialize attempts to shutdown
shutdownLock sync.Mutex
// updateNodeFromDriver is the callback used to update the node from fingerprinting
updateNodeFromDriver UpdateNodeDriverInfoFn
// eventHandlerFactory is used to fetch a handler for a task event
eventHandlerFactory TaskEventHandlerFactory
// firstFingerprintCh is closed once the first successful fingerprint has
// been handled; WaitForFirstFingerprint blocks on it.
// NOTE(review): the previous comment said it also gates launching stats
// collection; that consumer is not visible in this file - confirm.
firstFingerprintCh chan struct{}
// hasFingerprinted records that the first fingerprint has been handled, so
// firstFingerprintCh is closed only once.
hasFingerprinted bool
// lastHealthState is the last known health fingerprinted by the manager
lastHealthState drivers.HealthState
// lastHealthStateMu guards lastHealthState
lastHealthStateMu sync.Mutex
}
// newInstanceManager builds the manager for one driver plugin from c and
// starts its run loop in a background goroutine before returning. The
// manager's context is derived from c.Ctx, so cancelling c.Ctx is how callers
// shut down the goroutines launched here.
func newInstanceManager(c *instanceManagerConfig) *instanceManager {
	mgrCtx, stop := context.WithCancel(c.Ctx)

	mgr := new(instanceManager)
	mgr.logger = c.Logger.With("driver", c.ID.Name)
	mgr.ctx = mgrCtx
	mgr.cancel = stop
	mgr.loader = c.Loader
	mgr.storeReattach = c.StoreReattach
	mgr.fetchReattach = c.FetchReattach
	mgr.pluginConfig = c.PluginConfig
	mgr.id = c.ID
	mgr.updateNodeFromDriver = c.UpdateNodeFromDriver
	mgr.eventHandlerFactory = c.EventHandlerFactory
	mgr.firstFingerprintCh = make(chan struct{})

	go mgr.run()
	return mgr
}
// WaitForFirstFingerprint blocks until the first of three things happens: the
// plugin has delivered its first fingerprint, the supplied ctx is done, or the
// instance manager itself has been shut down.
func (i *instanceManager) WaitForFirstFingerprint(ctx context.Context) {
	select {
	case <-i.firstFingerprintCh:
		// first fingerprint has been handled
		return
	case <-ctx.Done():
		// caller gave up waiting
		return
	case <-i.ctx.Done():
		// manager is shutting down
		return
	}
}
// run is the manager's long-lived goroutine. It verifies the plugin can be
// dispensed, launches the fingerprint and task event loops, waits for both to
// exit, and then shuts the plugin down via cleanup.
func (i *instanceManager) run() {
	// Dispense once up front so we know the plugin is usable at all.
	_, err := i.dispense()
	if err != nil {
		i.logger.Error("dispensing initial plugin failed", "error", err)
		return
	}

	// workers tracks the loops started below so shutdown can wait for them.
	var workers sync.WaitGroup
	launch := func(loop func()) {
		workers.Add(1)
		go func() {
			loop()
			workers.Done()
		}()
	}

	launch(i.fingerprint)
	launch(i.handleEvents)

	// Both loops have returned; do the final cleanup.
	workers.Wait()
	i.cleanup()
}
// dispense returns the driver plugin managed by this instance, launching or
// reattaching to the plugin process when no live instance is held.
//
// Under pluginLock it:
//  1. returns the cached driver if the held plugin instance has not exited;
//  2. otherwise tries to reattach using the stored reattach config (if any),
//     falling back to dispensing a new instance when reattaching fails;
//  3. retries the dispense once if the failure is
//     singleton.SingletonPluginExited;
//  4. stores the new plugin/driver and persists its reattach config.
//
// A non-nil error wraps the underlying cause (so callers may inspect it with
// errors.Is / errors.As), or reports that the loaded plugin does not implement
// drivers.DriverPlugin; in the latter case the plugin instance is killed.
func (i *instanceManager) dispense() (plugin drivers.DriverPlugin, err error) {
	i.pluginLock.Lock()
	defer i.pluginLock.Unlock()

	// See if we already have a running instance
	if i.plugin != nil && !i.plugin.Exited() {
		return i.driver, nil
	}

	var pluginInstance loader.PluginInstance
	dispenseFn := func() (loader.PluginInstance, error) {
		return i.loader.Dispense(i.id.Name, i.id.PluginType, i.pluginConfig, i.logger)
	}

	if reattach, ok := i.fetchReattach(); ok {
		// Reattach to existing plugin
		pluginInstance, err = i.loader.Reattach(i.id.Name, i.id.PluginType, reattach)

		// If reattachment fails, get a new plugin instance
		if err != nil {
			i.logger.Warn("failed to reattach to plugin, starting new instance", "error", err)
			pluginInstance, err = dispenseFn()
		}
	} else {
		// Get an instance of the plugin
		pluginInstance, err = dispenseFn()
	}

	if err != nil {
		// Retry as the error just indicates the singleton has exited
		if err == singleton.SingletonPluginExited {
			pluginInstance, err = dispenseFn()
		}

		// If we still have an error there is a real problem. Wrap with %w so
		// the cause stays reachable through the error chain.
		if err != nil {
			return nil, fmt.Errorf("failed to start plugin: %w", err)
		}
	}

	// Convert to a driver plugin
	driver, ok := pluginInstance.Plugin().(drivers.DriverPlugin)
	if !ok {
		// Not a driver: do not leave the freshly started plugin running.
		pluginInstance.Kill()
		return nil, fmt.Errorf("plugin loaded does not implement the driver interface")
	}

	// Store the plugin and driver
	i.plugin = pluginInstance
	i.driver = driver

	// Store the reattach config. A failure here is logged but not returned:
	// the driver itself is usable.
	if c, ok := pluginInstance.ReattachConfig(); ok {
		if err := i.storeReattach(c); err != nil {
			i.logger.Error("error storing driver plugin reattach config", "error", err)
		}
	}

	return driver, nil
}
// cleanup shuts the plugin down. While holding both the shutdown and plugin
// locks it kills the plugin if it is still running, clears the stored
// reattach config, and cancels the manager's context. It does nothing when no
// plugin instance was ever stored.
func (i *instanceManager) cleanup() {
	i.shutdownLock.Lock()
	i.pluginLock.Lock()
	defer func() {
		i.shutdownLock.Unlock()
		i.pluginLock.Unlock()
	}()

	running := i.plugin
	if running == nil {
		return
	}

	// An already-exited plugin only needs the context cancelled.
	if running.Exited() {
		i.cancel()
		return
	}

	running.Kill()
	if err := i.storeReattach(nil); err != nil {
		i.logger.Warn("error clearing plugin reattach config from state store", "error", err)
	}
	i.cancel()
}
// dispenseFingerprintCh obtains the driver via dispense and opens a
// Fingerprint stream on it, using a child of the manager's context. On
// success it returns the stream and that context's cancel func, which the
// caller must invoke once it is done with the stream. On failure both are nil
// and the error is returned.
func (i *instanceManager) dispenseFingerprintCh() (<-chan *drivers.Fingerprint, context.CancelFunc, error) {
	drv, err := i.dispense()
	if err != nil {
		return nil, nil, err
	}

	rpcCtx, stop := context.WithCancel(i.ctx)
	stream, err := drv.Fingerprint(rpcCtx)
	if err == nil {
		return stream, stop, nil
	}

	// The RPC never started; release the context we created for it.
	stop()
	return nil, nil, err
}
// fingerprint is the main loop for fingerprinting. It opens a fingerprint
// stream on the driver and processes every fingerprint received until the
// manager's context is done. When the stream is closed by the other end, or
// cannot be opened, it is re-dispensed with a backoff computed by
// helper.Backoff from driverFPBackoffBaseline and driverFPBackoffLimit.
//
// The cancel func of whichever stream is currently held is always invoked
// before the loop returns or swaps in a replacement stream.
func (i *instanceManager) fingerprint() {
	// backoff and retry used if the RPC is closed by the other end
	var backoff time.Duration
	var retry uint64

	fpChan, cancel, err := i.dispenseFingerprintCh()
	if err != nil {
		i.logger.Error("failed to dispense driver plugin", "error", err)

		// dispenseFingerprintCh returns a nil channel and a nil cancel func
		// on error. Receiving from a nil channel blocks forever (so the
		// stream would never be retried) and calling a nil func panics on
		// shutdown. Substitute an already-closed channel and a no-op cancel
		// so the loop below takes its normal "stream closed" retry path.
		closedCh := make(chan *drivers.Fingerprint)
		close(closedCh)
		fpChan = closedCh
		cancel = func() {}

		backoff = helper.Backoff(driverFPBackoffBaseline, driverFPBackoffLimit, retry)
		retry++
	}

	for {
		if backoff > 0 {
			select {
			case <-time.After(backoff):
			case <-i.ctx.Done():
				cancel()
				return
			}
		}

		select {
		case <-i.ctx.Done():
			cancel()
			return
		case fp, ok := <-fpChan:
			if ok {
				if fp.Err == nil {
					i.handleFingerprint(fp)
				} else {
					i.logger.Warn("received fingerprint error from driver", "error", fp.Err)
					i.handleFingerprintError()
				}
				continue
			}

			// avoid fingerprinting again if ctx and fpChan both close
			if i.ctx.Err() != nil {
				cancel()
				return
			}

			// if the channel is closed attempt to open a new one
			newFpChan, newCancel, err := i.dispenseFingerprintCh()
			if err != nil {
				i.logger.Warn("error fingerprinting driver", "error", err, "retry", retry)
				i.handleFingerprintError()

				// Calculate the new backoff. fpChan is left as the closed
				// channel so the next iteration retries after the wait.
				backoff = helper.Backoff(driverFPBackoffBaseline, driverFPBackoffLimit, retry)
				retry++
				continue
			}

			// Release the old stream's context before adopting the new one.
			cancel()
			fpChan = newFpChan
			cancel = newCancel

			// Reset backoff
			backoff = 0
			retry = 0
		}
	}
}
// handleFingerprintError reports the driver as unhealthy to the node. It is
// invoked whenever fingerprinting the driver produced an error.
func (i *instanceManager) handleFingerprintError() {
	i.updateNodeFromDriver(i.id.Name, &structs.DriverInfo{
		Healthy:           false,
		HealthDescription: "failed to fingerprint driver",
		UpdateTime:        time.Now(),
	})
}
// handleFingerprint pushes a successful fingerprint to the node, records the
// reported health state, logs health transitions after the first fingerprint,
// and on the very first fingerprint closes firstFingerprintCh.
func (i *instanceManager) handleFingerprint(fp *drivers.Fingerprint) {
	// Flatten the fingerprint's attributes into their string form.
	flat := make(map[string]string, len(fp.Attributes))
	for name, value := range fp.Attributes {
		flat[name] = value.GoString()
	}

	i.updateNodeFromDriver(i.id.Name, &structs.DriverInfo{
		Attributes:        flat,
		Detected:          fp.Health != drivers.HealthStateUndetected,
		Healthy:           fp.Health == drivers.HealthStateHealthy,
		HealthDescription: fp.HealthDescription,
		UpdateTime:        time.Now(),
	})

	// Only transitions after the initial fingerprint are worth an info log.
	i.lastHealthStateMu.Lock()
	if i.hasFingerprinted && i.lastHealthState != fp.Health {
		i.logger.Info("driver health state has changed", "previous", i.lastHealthState, "current", fp.Health, "description", fp.HealthDescription)
	}
	i.lastHealthState = fp.Health
	i.lastHealthStateMu.Unlock()

	if i.hasFingerprinted {
		return
	}

	// First fingerprint: signal anyone blocked in WaitForFirstFingerprint.
	i.logger.Debug("initial driver fingerprint", "health", fp.Health, "description", fp.HealthDescription)
	close(i.firstFingerprintCh)
	i.hasFingerprinted = true
}
// getLastHealth reports the health state recorded by the most recently
// handled fingerprint, read under lastHealthStateMu.
func (i *instanceManager) getLastHealth() drivers.HealthState {
	i.lastHealthStateMu.Lock()
	state := i.lastHealthState
	i.lastHealthStateMu.Unlock()
	return state
}
// dispenseTaskEventsCh obtains the driver via dispense and opens a TaskEvents
// stream on it, using a child of the manager's context. On success it returns
// the stream and that context's cancel func, which the caller must invoke to
// clean up the context. On failure both are nil and the error is returned.
func (i *instanceManager) dispenseTaskEventsCh() (<-chan *drivers.TaskEvent, context.CancelFunc, error) {
	drv, err := i.dispense()
	if err != nil {
		return nil, nil, err
	}

	rpcCtx, stop := context.WithCancel(i.ctx)
	stream, err := drv.TaskEvents(rpcCtx)
	if err == nil {
		return stream, stop, nil
	}

	// The RPC never started; release the context we created for it.
	stop()
	return nil, nil, err
}
// handleEvents is the main loop that receives task events from the driver. It
// opens a task event stream and passes every event to handleEvent until the
// manager's context is done. When the stream is closed by the other end, or
// cannot be opened, it is re-dispensed with a backoff computed by
// helper.Backoff from driverFPBackoffBaseline and driverFPBackoffLimit.
//
// The cancel func of whichever stream is currently held is always invoked
// before the loop returns or swaps in a replacement stream.
func (i *instanceManager) handleEvents() {
	var backoff time.Duration
	var retry uint64

	eventsCh, cancel, err := i.dispenseTaskEventsCh()
	if err != nil {
		i.logger.Error("failed to dispense driver", "error", err)

		// dispenseTaskEventsCh returns a nil channel and a nil cancel func on
		// error. Receiving from a nil channel blocks forever (so the stream
		// would never be retried) and calling a nil func panics on shutdown.
		// Substitute an already-closed channel and a no-op cancel so the loop
		// below takes its normal "stream closed" retry path.
		closedCh := make(chan *drivers.TaskEvent)
		close(closedCh)
		eventsCh = closedCh
		cancel = func() {}

		backoff = helper.Backoff(driverFPBackoffBaseline, driverFPBackoffLimit, retry)
		retry++
	}

	for {
		if backoff > 0 {
			select {
			case <-time.After(backoff):
			case <-i.ctx.Done():
				cancel()
				return
			}
		}

		select {
		case <-i.ctx.Done():
			cancel()
			return
		case ev, ok := <-eventsCh:
			if ok {
				i.handleEvent(ev)
				continue
			}

			// avoid dispensing again if ctx and eventsCh both close; select
			// may pick the closed channel even though we are shutting down
			if i.ctx.Err() != nil {
				cancel()
				return
			}

			// if the channel is closed attempt to open a new one
			newEventsChan, newCancel, err := i.dispenseTaskEventsCh()
			if err != nil {
				i.logger.Warn("failed to receive task events, retrying", "error", err, "retry", retry)

				// Calculate the new backoff. eventsCh is left as the closed
				// channel so the next iteration retries after the wait.
				backoff = helper.Backoff(driverFPBackoffBaseline, driverFPBackoffLimit, retry)
				retry++
				continue
			}

			// Release the old stream's context before adopting the new one.
			cancel()
			eventsCh = newEventsChan
			cancel = newCancel

			// Reset backoff
			backoff = 0
			retry = 0
		}
	}
}
// handleEvent dispatches a single task event to the handler registered for
// its allocation and task. Plugin-shutdown events are dropped silently, and a
// warning is logged when no handler is registered.
func (i *instanceManager) handleEvent(ev *drivers.TaskEvent) {
	// Do not emit that the plugin is shutdown
	if evErr := ev.Err; evErr != nil && evErr == bstructs.ErrPluginShutdown {
		return
	}

	handler := i.eventHandlerFactory(ev.AllocID, ev.TaskName)
	if handler == nil {
		i.logger.Warn("no handler registered for event", "event", ev, "error", ev.Err)
		return
	}

	i.logger.Trace("task event received", "event", ev)
	handler(ev)
}