open-nomad/plugins/device/device.go

230 lines
6.0 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package device
import (
"context"
"fmt"
"time"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/plugins/base"
"github.com/hashicorp/nomad/plugins/shared/structs"
)
// DeviceTypeGPU is the canonical device type string used for GPUs.
const DeviceTypeGPU = "gpu"
// ErrPluginDisabled is the error used to signal that the device plugin is
// disabled.
var ErrPluginDisabled = fmt.Errorf("device is not enabled")
// DevicePlugin is implemented by plugins that detect devices, report them to
// Nomad, and tell Nomad how those devices should be mounted.
type DevicePlugin interface {
	base.BasePlugin

	// Fingerprint returns a channel of fingerprint responses, each carrying
	// the detected devices or an error.
	Fingerprint(ctx context.Context) (<-chan *FingerprintResponse, error)

	// Reserve reserves the devices with the given IDs and returns the
	// instructions for mounting them.
	Reserve(deviceIDs []string) (*ContainerReservation, error)

	// Stats returns a channel of per-device statistics collected at the
	// given interval.
	Stats(ctx context.Context, interval time.Duration) (<-chan *StatsResponse, error)
}
// FingerprintResponse is a single fingerprinting result: either the set of
// detected device groups or the error that fingerprinting ran into.
type FingerprintResponse struct {
	// Devices holds the device groups that were detected.
	Devices []*DeviceGroup

	// Error is set when fingerprinting failed.
	Error error
}
// NewFingerprint builds a fingerprint response whose Devices field is the
// given device groups. The Error field is left unset.
func NewFingerprint(devices ...*DeviceGroup) *FingerprintResponse {
	resp := new(FingerprintResponse)
	resp.Devices = devices
	return resp
}
// NewFingerprintError builds a fingerprint response whose Error field is the
// given error. The Devices field is left unset.
func NewFingerprintError(err error) *FingerprintResponse {
	resp := new(FingerprintResponse)
	resp.Error = err
	return resp
}
// DeviceGroup collects device instances that have the same vendor, device
// type and name.
type DeviceGroup struct {
	// Vendor identifies who provides the device (nvidia, intel, etc).
	Vendor string

	// Type is the kind of device (gpu, fpga, etc).
	Type string

	// Name is the device's model name.
	Name string

	// Devices lists the individual device instances in the group.
	Devices []*Device

	// Attributes holds the attributes common to every device in the group.
	Attributes map[string]*structs.Attribute
}
// Validate checks that the device group is well formed. The vendor, type and
// name must all be set, every entry in Devices must be non-nil and pass its
// own validation, and every entry in Attributes must be non-nil and pass its
// own validation. Every problem found is accumulated and reported together;
// nil is returned when the group is valid.
func (d *DeviceGroup) Validate() error {
	// Append adds to mErr in place, so its return value is intentionally
	// discarded throughout.
	var mErr multierror.Error

	if d.Vendor == "" {
		_ = multierror.Append(&mErr, fmt.Errorf("device vendor must be specified"))
	}
	if d.Type == "" {
		_ = multierror.Append(&mErr, fmt.Errorf("device type must be specified"))
	}
	if d.Name == "" {
		_ = multierror.Append(&mErr, fmt.Errorf("device name must be specified"))
	}

	for i, dev := range d.Devices {
		if dev == nil {
			_ = multierror.Append(&mErr, fmt.Errorf("device %d is nil", i))
			continue
		}
		if err := dev.Validate(); err != nil {
			_ = multierror.Append(&mErr, multierror.Prefix(err, fmt.Sprintf("device %d: ", i)))
		}
	}

	for k, v := range d.Attributes {
		// Guard nil attributes the same way nil devices are guarded above, so
		// Validate is never invoked on a nil pointer.
		if v == nil {
			_ = multierror.Append(&mErr, fmt.Errorf("device attribute %q is nil", k))
			continue
		}
		if err := v.Validate(); err != nil {
			// %w keeps the underlying error in the chain instead of
			// flattening it to text.
			_ = multierror.Append(&mErr, fmt.Errorf("device attribute %q invalid: %w", k, err))
		}
	}

	return mErr.ErrorOrNil()
}
// Device describes one instance of a particular device.
type Device struct {
	// ID identifies the device.
	ID string

	// Healthy reports whether the device is healthy and may be used for
	// scheduling.
	Healthy bool

	// HealthDesc explains why the device may be unhealthy.
	HealthDesc string

	// HwLocality holds the hardware locality information for the device.
	HwLocality *DeviceLocality
}
// Validate reports an error when the device has no ID and returns nil
// otherwise.
func (d *Device) Validate() error {
	if d.ID != "" {
		return nil
	}
	return fmt.Errorf("device ID must be specified")
}
// DeviceLocality holds hardware locality information about a device.
type DeviceLocality struct {
	// PciBusID is the device's PCI bus ID.
	PciBusID string
}
// ContainerReservation carries the instructions for mounting a device into a
// container, where a container is an isolated environment sharing the host's
// OS.
type ContainerReservation struct {
	// Envs are the environment variables to set for the task.
	Envs map[string]string

	// Mounts are host volumes to mount into the container; they may include
	// libraries, etc.
	Mounts []*Mount

	// Devices are the devices to mount into the container.
	Devices []*DeviceSpec
}
// Mount describes a host directory to be mounted into a container.
type Mount struct {
	// TaskPath is where in the task's file system the mount is placed.
	TaskPath string

	// HostPath is the path of the host directory being mounted.
	HostPath string

	// ReadOnly controls whether the task sees the mount as read only.
	ReadOnly bool
}
// DeviceSpec describes how a device is mounted into a container.
type DeviceSpec struct {
	// TaskPath is where in the task's file system the device is mounted.
	TaskPath string

	// HostPath is where the device lives on the host.
	HostPath string

	// CgroupPerms are the permissions applied when mounting the device.
	CgroupPerms string
}
// StatsResponse is a single statistics result: either the statistics for
// each device group or the error that collection ran into.
type StatsResponse struct {
	// Groups holds the statistics for each device group.
	Groups []*DeviceGroupStats

	// Error is set when collecting statistics failed.
	Error error
}
// NewStatsError builds a stats response whose Error field is the given
// error. The Groups field is left unset.
func NewStatsError(err error) *StatsResponse {
	resp := new(StatsResponse)
	resp.Error = err
	return resp
}
// DeviceGroupStats holds the statistics for every device in one device
// group. The group is identified by the device's vendor, type and name.
type DeviceGroupStats struct {
	// Vendor is the vendor part of the group's identity.
	Vendor string

	// Type is the device type part of the group's identity.
	Type string

	// Name is the name part of the group's identity.
	Name string

	// InstanceStats maps each device ID to that device's statistics.
	InstanceStats map[string]*DeviceStats
}
// DeviceStats holds the statistics for a single device.
type DeviceStats struct {
	// Summary is one summary metric, intended to be the most informative
	// one for users.
	Summary *structs.StatValue

	// Stats holds the verbose statistics for the device.
	Stats *structs.StatObject

	// Timestamp records when the statistics were collected.
	Timestamp time.Time
}