open-nomad/devices/gpu/nvidia/nvml/driver_linux.go

86 lines
2.7 KiB
Go
Raw Normal View History

package nvml
import (
"github.com/NVIDIA/gpu-monitoring-tools/bindings/go/nvml"
)
// Initialize prepares the NVML library for use by delegating to nvml.Init
// (per the original note on this method: the shared object file is located
// and opened with dlopen). The error reported by nvml.Init, if any, is
// returned unchanged.
func (n *nvmlDriver) Initialize() error {
	initErr := nvml.Init()
	return initErr
}
// Shutdown ends interaction with the NVML library by delegating to
// nvml.Shutdown. The error reported by nvml.Shutdown, if any, is returned
// unchanged.
func (n *nvmlDriver) Shutdown() error {
	shutdownErr := nvml.Shutdown()
	return shutdownErr
}
// SystemDriverVersion reports the version of the installed driver as
// returned by nvml.GetDriverVersion. Both the version string and the error
// are passed through to the caller untouched.
func (n *nvmlDriver) SystemDriverVersion() (string, error) {
	version, err := nvml.GetDriverVersion()
	return version, err
}
// DeviceCount reports how many GPU devices are available, as returned by
// nvml.GetDeviceCount. Both the count and the error are passed through to the
// caller untouched.
func (n *nvmlDriver) DeviceCount() (uint, error) {
	count, err := nvml.GetDeviceCount()
	return count, err
}
// DeviceInfoByIndex returns the DeviceInfo for the GPU at position index in
// the system device list, including its display state and persistence mode.
//
// If nvml.NewDevice or the device mode query fails, the error is returned
// unchanged together with a nil DeviceInfo.
func (n *nvmlDriver) DeviceInfoByIndex(index uint) (*DeviceInfo, error) {
	device, err := nvml.NewDevice(index)
	if err != nil {
		return nil, err
	}
	deviceMode, err := device.GetDeviceMode()
	if err != nil {
		return nil, err
	}

	info := deviceInfoFromNVML(device)
	info.DisplayState = deviceMode.DisplayInfo.Mode.String()
	info.PersistenceMode = deviceMode.Persistence.String()
	return info, nil
}

// DeviceInfoAndStatusByIndex returns both the DeviceInfo and the DeviceStatus
// for the GPU at position index in the system device list.
//
// The device mode is not queried here, so the returned DeviceInfo leaves
// DisplayState and PersistenceMode at their zero values. If nvml.NewDevice or
// the status query fails, the error is returned unchanged together with nil
// results.
func (n *nvmlDriver) DeviceInfoAndStatusByIndex(index uint) (*DeviceInfo, *DeviceStatus, error) {
	device, err := nvml.NewDevice(index)
	if err != nil {
		return nil, nil, err
	}
	status, err := device.Status()
	if err != nil {
		return nil, nil, err
	}

	return deviceInfoFromNVML(device), &DeviceStatus{
		TemperatureC:       status.Temperature,
		GPUUtilization:     status.Utilization.GPU,
		MemoryUtilization:  status.Utilization.Memory,
		EncoderUtilization: status.Utilization.Encoder,
		DecoderUtilization: status.Utilization.Decoder,
		UsedMemoryMiB:      status.Memory.Global.Used,
		ECCErrorsL1Cache:   status.Memory.ECCErrors.L1Cache,
		ECCErrorsL2Cache:   status.Memory.ECCErrors.L2Cache,
		ECCErrorsDevice:    status.Memory.ECCErrors.Device,
		PowerUsageW:        status.Power,
		BAR1UsedMiB:        status.PCI.BAR1Used,
	}, nil
}

// deviceInfoFromNVML copies the attributes that both lookup methods report
// (identity, memory, power, PCI and clock fields) from an nvml device handle
// into a new DeviceInfo. DisplayState and PersistenceMode are left unset;
// callers that have queried the device mode fill them in afterwards.
func deviceInfoFromNVML(device *nvml.Device) *DeviceInfo {
	return &DeviceInfo{
		UUID:               device.UUID,
		Name:               device.Model,
		MemoryMiB:          device.Memory,
		PowerW:             device.Power,
		BAR1MiB:            device.PCI.BAR1,
		PCIBandwidthMBPerS: device.PCI.Bandwidth,
		PCIBusID:           device.PCI.BusID,
		CoresClockMHz:      device.Clocks.Cores,
		MemoryClockMHz:     device.Clocks.Memory,
	}
}