d71a90c8a4
* Throw away result of multierror.Append
  When given a *multierror.Error, it is mutated, therefore the return value is not needed.
* Simplify MergeMultierrorWarnings, use StringBuilder
* Hash.Write() never returns an error
* Remove error that was always nil
* Remove error from Resources.Add signature
  When this was originally written it could return an error, but that was refactored away, and callers of it as of today never handle the error.
* Throw away results of io.Copy during Bridge
* Handle errors when computing node class in test
227 lines
6 KiB
Go
227 lines
6 KiB
Go
package device
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"time"
|
|
|
|
multierror "github.com/hashicorp/go-multierror"
|
|
"github.com/hashicorp/nomad/plugins/base"
|
|
"github.com/hashicorp/nomad/plugins/shared/structs"
|
|
)
|
|
|
|
// DeviceTypeGPU is the canonical device type string used for GPUs.
const DeviceTypeGPU = "gpu"
|
|
|
|
// ErrPluginDisabled is the error used to signal that the device plugin is
// disabled.
var ErrPluginDisabled = fmt.Errorf("device is not enabled")
|
|
|
|
// DevicePlugin is the interface for a plugin that can expose detected devices
|
|
// to Nomad and inform it how to mount them.
|
|
type DevicePlugin interface {
|
|
base.BasePlugin
|
|
|
|
// Fingerprint returns a stream of devices that are detected.
|
|
Fingerprint(ctx context.Context) (<-chan *FingerprintResponse, error)
|
|
|
|
// Reserve is used to reserve a set of devices and retrieve mount
|
|
// instructions.
|
|
Reserve(deviceIDs []string) (*ContainerReservation, error)
|
|
|
|
// Stats returns a stream of statistics per device collected at the passed
|
|
// interval.
|
|
Stats(ctx context.Context, interval time.Duration) (<-chan *StatsResponse, error)
|
|
}
|
|
|
|
// FingerprintResponse includes a set of detected devices or an error in the
|
|
// process of fingerprinting.
|
|
type FingerprintResponse struct {
|
|
// Devices is a set of devices that have been detected.
|
|
Devices []*DeviceGroup
|
|
|
|
// Error is populated when fingerprinting has failed.
|
|
Error error
|
|
}
|
|
|
|
// NewFingerprint takes a set of device groups and returns a fingerprint
|
|
// response
|
|
func NewFingerprint(devices ...*DeviceGroup) *FingerprintResponse {
|
|
return &FingerprintResponse{
|
|
Devices: devices,
|
|
}
|
|
}
|
|
|
|
// NewFingerprintError takes an error and returns a fingerprint response
|
|
func NewFingerprintError(err error) *FingerprintResponse {
|
|
return &FingerprintResponse{
|
|
Error: err,
|
|
}
|
|
}
|
|
|
|
// DeviceGroup is a grouping of devices that share a common vendor, device type
|
|
// and name.
|
|
type DeviceGroup struct {
|
|
// Vendor is the vendor providing the device (nvidia, intel, etc).
|
|
Vendor string
|
|
|
|
// Type is the type of the device (gpu, fpga, etc).
|
|
Type string
|
|
|
|
// Name is the devices model name.
|
|
Name string
|
|
|
|
// Devices is the set of device instances.
|
|
Devices []*Device
|
|
|
|
// Attributes are a set of attributes shared for all the devices.
|
|
Attributes map[string]*structs.Attribute
|
|
}
|
|
|
|
// Validate validates that the device group is valid
|
|
func (d *DeviceGroup) Validate() error {
|
|
var mErr multierror.Error
|
|
|
|
if d.Vendor == "" {
|
|
_ = multierror.Append(&mErr, fmt.Errorf("device vendor must be specified"))
|
|
}
|
|
if d.Type == "" {
|
|
_ = multierror.Append(&mErr, fmt.Errorf("device type must be specified"))
|
|
}
|
|
if d.Name == "" {
|
|
_ = multierror.Append(&mErr, fmt.Errorf("device name must be specified"))
|
|
}
|
|
|
|
for i, dev := range d.Devices {
|
|
if dev == nil {
|
|
_ = multierror.Append(&mErr, fmt.Errorf("device %d is nil", i))
|
|
continue
|
|
}
|
|
|
|
if err := dev.Validate(); err != nil {
|
|
_ = multierror.Append(&mErr, multierror.Prefix(err, fmt.Sprintf("device %d: ", i)))
|
|
}
|
|
}
|
|
|
|
for k, v := range d.Attributes {
|
|
if err := v.Validate(); err != nil {
|
|
_ = multierror.Append(&mErr, fmt.Errorf("device attribute %q invalid: %v", k, err))
|
|
}
|
|
}
|
|
|
|
return mErr.ErrorOrNil()
|
|
|
|
}
|
|
|
|
// Device is an instance of a particular device.
|
|
type Device struct {
|
|
// ID is the identifier for the device.
|
|
ID string
|
|
|
|
// Healthy marks whether the device is healthy and can be used for
|
|
// scheduling.
|
|
Healthy bool
|
|
|
|
// HealthDesc describes why the device may be unhealthy.
|
|
HealthDesc string
|
|
|
|
// HwLocality captures hardware locality information for the device.
|
|
HwLocality *DeviceLocality
|
|
}
|
|
|
|
// Validate validates that the device is valid
|
|
func (d *Device) Validate() error {
|
|
if d.ID == "" {
|
|
return fmt.Errorf("device ID must be specified")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// DeviceLocality holds the hardware locality information of a device.
type DeviceLocality struct {
	// PciBusID is the device's PCI bus ID.
	PciBusID string
}
|
|
|
|
// ContainerReservation describes how to mount a device into a container. A
|
|
// container is an isolated environment that shares the host's OS.
|
|
type ContainerReservation struct {
|
|
// Envs are a set of environment variables to set for the task.
|
|
Envs map[string]string
|
|
|
|
// Mounts are used to mount host volumes into a container that may include
|
|
// libraries, etc.
|
|
Mounts []*Mount
|
|
|
|
// Devices are the set of devices to mount into the container.
|
|
Devices []*DeviceSpec
|
|
}
|
|
|
|
// Mount describes a host directory that is mounted into a container.
type Mount struct {
	// TaskPath is where the mount appears in the task's file system.
	TaskPath string

	// HostPath is the path of the host directory being mounted.
	HostPath string

	// ReadOnly reports whether the task gets the mount read only.
	ReadOnly bool
}
|
|
|
|
// DeviceSpec describes how a device is mounted into a container.
type DeviceSpec struct {
	// TaskPath is where the device is mounted in the task's file system.
	TaskPath string

	// HostPath is where the device lives on the host.
	HostPath string

	// CgroupPerms holds the permissions to use when mounting the device.
	CgroupPerms string
}
|
|
|
|
// StatsResponse returns statistics for each device group.
|
|
type StatsResponse struct {
|
|
// Groups contains statistics for each device group.
|
|
Groups []*DeviceGroupStats
|
|
|
|
// Error is populated when collecting statistics has failed.
|
|
Error error
|
|
}
|
|
|
|
// NewStatsError takes an error and returns a stats response
|
|
func NewStatsError(err error) *StatsResponse {
|
|
return &StatsResponse{
|
|
Error: err,
|
|
}
|
|
}
|
|
|
|
// DeviceGroupStats contains statistics for each device of a particular
|
|
// device group, identified by the vendor, type and name of the device.
|
|
type DeviceGroupStats struct {
|
|
Vendor string
|
|
Type string
|
|
Name string
|
|
|
|
// InstanceStats is a mapping of each device ID to its statistics.
|
|
InstanceStats map[string]*DeviceStats
|
|
}
|
|
|
|
// DeviceStats is the statistics for an individual device
|
|
type DeviceStats struct {
|
|
// Summary exposes a single summary metric that should be the most
|
|
// informative to users.
|
|
Summary *structs.StatValue
|
|
|
|
// Stats contains the verbose statistics for the device.
|
|
Stats *structs.StatObject
|
|
|
|
// Timestamp is the time the statistics were collected.
|
|
Timestamp time.Time
|
|
}
|