1
0
mirror of https://github.com/lxc/incus.git synced 2026-02-06 12:46:34 +01:00
Files
incus/shared/resources/gpu.go
Stéphane Graber f5ef7a8402 shared/resources: Restrict to Linux
Signed-off-by: Stéphane Graber <stgraber@stgraber.org>
2025-08-12 15:57:04 -04:00

602 lines
15 KiB
Go

//go:build linux
package resources
import (
"bufio"
"encoding/csv"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"slices"
"strconv"
"strings"
"github.com/jaypipes/pcidb"
"golang.org/x/sys/unix"
"github.com/lxc/incus/v6/shared/api"
)
// Filesystem roots scanned during GPU discovery. They are variables rather
// than constants, so they can be reassigned.
var (
	// procDriverNvidia is the NVIDIA kernel driver's directory under /proc.
	procDriverNvidia = "/proc/driver/nvidia"

	// sysClassDrm is the directory listing the kernel's DRM devices.
	sysClassDrm = "/sys/class/drm"
)
// loadNvidiaProc builds the NVIDIA card map from the "gpus" directory below
// procDriverNvidia.
//
// Every sub-directory containing an "information" file produces one entry,
// keyed by the sub-directory name. The file is parsed as "Key: value" lines;
// "Model" fills Model and Brand (the first space-separated word of the model)
// and "Device Minor" fills CardName ("nvidia<minor>") and CardDevice
// ("195:<minor>").
//
// An error is returned if the gpus directory is missing or cannot be listed,
// or if an information file cannot be opened, read or closed.
func loadNvidiaProc() (map[string]*api.ResourcesGPUCardNvidia, error) {
	gpusPath := filepath.Join(procDriverNvidia, "gpus")
	if !sysfsExists(gpusPath) {
		return nil, errors.New("No NVIDIA GPU proc driver")
	}

	// List the GPUs from /proc
	entries, err := os.ReadDir(gpusPath)
	if err != nil {
		return nil, fmt.Errorf("Failed to list %q: %w", gpusPath, err)
	}

	nvidiaCards := map[string]*api.ResourcesGPUCardNvidia{}
	for _, entry := range entries {
		entryName := entry.Name()
		infoPath := filepath.Join(gpusPath, entryName, "information")

		if !sysfsExists(infoPath) {
			continue
		}

		// Get the GPU information
		f, err := os.Open(infoPath)
		if err != nil {
			return nil, fmt.Errorf("Failed to open %q: %w", infoPath, err)
		}

		nvidiaCard := &api.ResourcesGPUCardNvidia{}
		gpuInfo := bufio.NewScanner(f)
		for gpuInfo.Scan() {
			key, value, found := strings.Cut(gpuInfo.Text(), ":")
			if !found {
				continue
			}

			key = strings.TrimSpace(key)
			value = strings.TrimSpace(value)

			switch key {
			case "Model":
				nvidiaCard.Model = value
				nvidiaCard.Brand, _, _ = strings.Cut(value, " ")
			case "Device Minor":
				nvidiaCard.CardName = fmt.Sprintf("nvidia%s", value)
				nvidiaCard.CardDevice = fmt.Sprintf("195:%s", value)
			}
		}

		// Scan() returning false does not distinguish EOF from failure, so
		// capture the scanner error before closing and always close the file.
		scanErr := gpuInfo.Err()
		closeErr := f.Close()

		if scanErr != nil {
			return nil, fmt.Errorf("Failed to read %q: %w", infoPath, scanErr)
		}

		if closeErr != nil {
			return nil, fmt.Errorf("Failed to close %q: %w", infoPath, closeErr)
		}

		nvidiaCards[entryName] = nvidiaCard
	}

	return nvidiaCards, nil
}
// loadNvidiaContainer builds the NVIDIA card map from the output of
// "nvidia-container-cli info --csv".
//
// The second CSV line supplies the NVRM and CUDA versions (columns 0 and 1).
// From the fourth line on, each record with at least seven columns describes
// one GPU: column 1 is the device minor, 2 the model, 3 the brand, 4 the UUID,
// 5 the map key and 6 the architecture. Shorter records and records that fail
// CSV parsing are skipped.
//
// An error is returned if the tool cannot be found, started or read from, or
// if it exits unsuccessfully.
func loadNvidiaContainer() (map[string]*api.ResourcesGPUCardNvidia, error) {
	// Check for nvidia-container-cli
	_, err := exec.LookPath("nvidia-container-cli")
	if err != nil {
		return nil, fmt.Errorf("Failed to locate nvidia-container-cli: %w", err)
	}

	// Prepare nvidia-container-cli call
	cmd := exec.Command("nvidia-container-cli", "info", "--csv")
	outPipe, err := cmd.StdoutPipe()
	if err != nil {
		return nil, fmt.Errorf("Failed to setup PIPE for nvidia-container-cli: %w", err)
	}

	// Run the command
	err = cmd.Start()
	if err != nil {
		return nil, fmt.Errorf("Failed to start nvidia-container-cli: %w", err)
	}

	// Parse the data
	r := csv.NewReader(outPipe)
	r.FieldsPerRecord = -1 // Records legitimately differ in column count.

	nvidiaCards := map[string]*api.ResourcesGPUCardNvidia{}
	nvidiaNVRM := ""
	nvidiaCUDA := ""
	line := 0

	var readErr error
	for {
		record, err := r.Read()
		if errors.Is(err, io.EOF) {
			break
		}

		line++

		if err != nil {
			// A malformed record is skipped; the reader moves on to the
			// next one. Anything else is an I/O failure that would repeat
			// on every call, so stop instead of looping forever.
			var parseErr *csv.ParseError
			if errors.As(err, &parseErr) {
				continue
			}

			readErr = err
			break
		}

		switch {
		case line == 2:
			if len(record) >= 2 {
				nvidiaNVRM = record[0]
				nvidiaCUDA = record[1]
			}

		case line >= 4 && len(record) >= 7:
			// The length guard prevents an index-out-of-range panic on
			// truncated or unexpected records.
			nvidiaCards[record[5]] = &api.ResourcesGPUCardNvidia{
				NVRMVersion:  nvidiaNVRM,
				CUDAVersion:  nvidiaCUDA,
				Brand:        record[3],
				Model:        record[2],
				UUID:         record[4],
				Architecture: record[6],
				CardName:     fmt.Sprintf("nvidia%s", record[1]),
				CardDevice:   fmt.Sprintf("195:%s", record[1]),
			}
		}
	}

	if readErr != nil {
		// Nobody is draining the pipe anymore; kill the helper so that
		// Wait cannot block, then reap it. The read error is what gets
		// reported, so the results of Kill and Wait are intentionally dropped.
		_ = cmd.Process.Kill()
		_ = cmd.Wait()

		return nil, fmt.Errorf("Failed to read nvidia-container-cli output: %w", readErr)
	}

	// wait for nvidia-container-cli
	err = cmd.Wait()
	if err != nil {
		return nil, fmt.Errorf("nvidia-container-cli failed: %w", err)
	}

	return nvidiaCards, nil
}
// gpuAddDeviceInfo fills card with the details found under devicePath.
//
// In order, it records: SR-IOV VF counts, NUMA node, USB address or PCI
// vendor/product IDs (with names from pciDB when it is non-nil), driver name
// and version, NVIDIA details looked up in nvidiaCards by PCI address, DRM
// node names and device numbers, and mdev supported types.
//
// card is modified in place; card.PCIAddress must already be set by the caller
// for the NVIDIA lookup to happen. uname is only used as the driver version
// fallback. The first failed read or listing aborts with a wrapped error.
func gpuAddDeviceInfo(devicePath string, nvidiaCards map[string]*api.ResourcesGPUCardNvidia, pciDB *pcidb.PCIDB, uname unix.Utsname, card *api.ResourcesGPUCard) error {
// Handle nested devices: when "device" is itself a directory, describe that
// directory instead of this one.
if isDir(filepath.Join(devicePath, "device")) {
return gpuAddDeviceInfo(filepath.Join(devicePath, "device"), nvidiaCards, pciDB, uname, card)
}
// SRIOV (only when the device exposes sriov_numvfs)
if sysfsExists(filepath.Join(devicePath, "sriov_numvfs")) {
sriov := api.ResourcesGPUCardSRIOV{}
// Get maximum and current VF count
vfMaximum, err := readUint(filepath.Join(devicePath, "sriov_totalvfs"))
if err != nil {
return fmt.Errorf("Failed to read %q: %w", filepath.Join(devicePath, "sriov_totalvfs"), err)
}
vfCurrent, err := readUint(filepath.Join(devicePath, "sriov_numvfs"))
if err != nil {
return fmt.Errorf("Failed to read %q: %w", filepath.Join(devicePath, "sriov_numvfs"), err)
}
sriov.MaximumVFs = vfMaximum
sriov.CurrentVFs = vfCurrent
// Add the SRIOV data to the card
card.SRIOV = &sriov
}
// NUMA node
if sysfsExists(filepath.Join(devicePath, "numa_node")) {
numaNode, err := readInt(filepath.Join(devicePath, "numa_node"))
if err != nil {
return fmt.Errorf("Failed to read %q: %w", filepath.Join(devicePath, "numa_node"), err)
}
// Zero and negative values leave NUMANode at its zero value.
if numaNode > 0 {
card.NUMANode = uint64(numaNode)
}
}
// A device/busnum entry selects the USB branch; everything else is
// described by its vendor and product IDs.
deviceUSBPath := filepath.Join(devicePath, "device", "busnum")
if sysfsExists(deviceUSBPath) {
// USB address
deviceDevicePath := filepath.Join(devicePath, "device")
usbAddr, err := usbAddress(deviceDevicePath)
if err != nil {
return fmt.Errorf("Failed to find USB address for %q: %w", devicePath, err)
}
if usbAddr != "" {
card.USBAddress = usbAddr
}
} else {
// Vendor and product (stored without the leading "0x")
deviceVendorPath := filepath.Join(devicePath, "vendor")
if sysfsExists(deviceVendorPath) {
id, err := os.ReadFile(deviceVendorPath)
if err != nil {
return fmt.Errorf("Failed to read %q: %w", deviceVendorPath, err)
}
card.VendorID = strings.TrimPrefix(strings.TrimSpace(string(id)), "0x")
}
// Here "device" is read as a file holding the product ID.
deviceDevicePath := filepath.Join(devicePath, "device")
if sysfsExists(deviceDevicePath) {
id, err := os.ReadFile(deviceDevicePath)
if err != nil {
return fmt.Errorf("Failed to read %q: %w", deviceDevicePath, err)
}
card.ProductID = strings.TrimPrefix(strings.TrimSpace(string(id)), "0x")
}
// Fill vendor and product names (skipped when no PCI database is available)
if pciDB != nil {
vendor, ok := pciDB.Vendors[card.VendorID]
if ok {
card.Vendor = vendor.Name
for _, product := range vendor.Products {
if product.ID == card.ProductID {
card.Product = product.Name
break
}
}
}
}
}
// Driver information
driverPath := filepath.Join(devicePath, "driver")
if sysfsExists(driverPath) {
linkTarget, err := filepath.EvalSymlinks(driverPath)
if err != nil {
return fmt.Errorf("Failed to find %q: %w", driverPath, err)
}
// Set the driver name (last path element of the resolved link)
card.Driver = filepath.Base(linkTarget)
// Try to get the version, fallback to kernel version
out, err := os.ReadFile(filepath.Join(driverPath, "module", "version"))
if err == nil {
card.DriverVersion = strings.TrimSpace(string(out))
} else {
card.DriverVersion = strings.TrimRight(string(uname.Release[:]), "\x00")
}
}
// NVIDIA specific stuff: look the card up by PCI address, then by the same
// address prefixed with "0000".
// NOTE(review): the prefixed form presumably matches keys that use an
// 8-digit PCI domain - confirm against the producers of nvidiaCards.
if card.Driver == "nvidia" && card.PCIAddress != "" {
nvidia, ok := nvidiaCards[card.PCIAddress]
if ok {
card.Nvidia = nvidia
} else {
nvidia, ok := nvidiaCards[fmt.Sprintf("0000%s", card.PCIAddress)]
if ok {
card.Nvidia = nvidia
}
}
}
// DRM information
drmPath := filepath.Join(devicePath, "drm")
if sysfsExists(drmPath) {
drm := api.ResourcesGPUCardDRM{}
// List all the devices
entries, err := os.ReadDir(drmPath)
if err != nil {
return fmt.Errorf("Failed to list %q: %w", drmPath, err)
}
// Fill in the struct; entries are classified by name prefix and a later
// entry with the same prefix overwrites an earlier one.
for _, entry := range entries {
entryName := entry.Name()
entryPath := filepath.Join(drmPath, entryName)
after, ok := strings.CutPrefix(entryName, "card")
if ok {
// Get the card ID (the decimal number following "card")
idStr := after
id, err := strconv.ParseUint(idStr, 10, 64)
if err != nil {
return fmt.Errorf("Failed to parse card number: %w", err)
}
dev, err := os.ReadFile(filepath.Join(entryPath, "dev"))
if err != nil {
return fmt.Errorf("Failed to read %q: %w", filepath.Join(entryPath, "dev"), err)
}
drm.ID = id
drm.CardName = entryName
drm.CardDevice = strings.TrimSpace(string(dev))
}
// Control node
if strings.HasPrefix(entryName, "controlD") {
dev, err := os.ReadFile(filepath.Join(entryPath, "dev"))
if err != nil {
return fmt.Errorf("Failed to read %q: %w", filepath.Join(entryPath, "dev"), err)
}
drm.ControlName = entryName
drm.ControlDevice = strings.TrimSpace(string(dev))
}
// Render node
if strings.HasPrefix(entryName, "renderD") {
dev, err := os.ReadFile(filepath.Join(entryPath, "dev"))
if err != nil {
return fmt.Errorf("Failed to read %q: %w", filepath.Join(entryPath, "dev"), err)
}
drm.RenderName = entryName
drm.RenderDevice = strings.TrimSpace(string(dev))
}
}
card.DRM = &drm
}
// Mdev information (one map entry per directory in mdev_supported_types)
mdevPath := filepath.Join(devicePath, "mdev_supported_types")
if sysfsExists(mdevPath) {
card.Mdev = map[string]api.ResourcesGPUCardMdev{}
// List all the devices
entries, err := os.ReadDir(mdevPath)
if err != nil {
return fmt.Errorf("Failed to list %q: %w", mdevPath, err)
}
// Fill in the struct; every attribute is optional and only read when its
// file exists.
for _, entry := range entries {
mdev := api.ResourcesGPUCardMdev{}
entryName := entry.Name()
entryPath := filepath.Join(mdevPath, entryName)
// API
apiPath := filepath.Join(entryPath, "device_api")
if sysfsExists(apiPath) {
deviceAPI, err := os.ReadFile(apiPath)
if err != nil {
return fmt.Errorf("Failed to read %q: %w", apiPath, err)
}
mdev.API = strings.TrimSpace(string(deviceAPI))
}
// Available
availablePath := filepath.Join(entryPath, "available_instances")
if sysfsExists(availablePath) {
available, err := readUint(availablePath)
if err != nil {
return fmt.Errorf("Failed to read %q: %w", availablePath, err)
}
mdev.Available = available
}
// Description
descriptionPath := filepath.Join(entryPath, "description")
if sysfsExists(descriptionPath) {
description, err := os.ReadFile(descriptionPath)
if err != nil {
return fmt.Errorf("Failed to read %q: %w", descriptionPath, err)
}
mdev.Description = strings.TrimSpace(string(description))
}
// Devices (names of the entries in the "devices" directory)
mdevDevicesPath := filepath.Join(entryPath, "devices")
if sysfsExists(mdevDevicesPath) {
devs, err := os.ReadDir(mdevDevicesPath)
if err != nil {
return fmt.Errorf("Failed to list %q: %w", mdevDevicesPath, err)
}
mdev.Devices = []string{}
for _, dev := range devs {
mdev.Devices = append(mdev.Devices, dev.Name())
}
}
// Name
namePath := filepath.Join(entryPath, "name")
if sysfsExists(namePath) {
name, err := os.ReadFile(namePath)
if err != nil {
return fmt.Errorf("Failed to read %q: %w", namePath, err)
}
mdev.Name = strings.TrimSpace(string(name))
}
card.Mdev[entryName] = mdev
}
}
return nil
}
// GetGPU returns a filled api.ResourcesGPU struct ready for use by Incus.
//
// Cards are discovered in two passes: first the "card*" entries of the DRM
// class directory, then any remaining PCI device whose class starts with
// "0x03". Devices exposing a "physfn" link are treated as virtual functions
// and attached to the card whose PCI address matches the link target, provided
// that card carries SR-IOV data. Total counts the top-level cards plus the
// attached virtual functions.
func GetGPU() (*api.ResourcesGPU, error) {
	result := api.ResourcesGPU{Cards: []api.ResourcesGPUCard{}}

	// Kernel release, used as the fallback driver version.
	var kernel unix.Utsname

	err := unix.Uname(&kernel)
	if err != nil {
		return nil, fmt.Errorf("Failed to get uname: %w", err)
	}

	// The PCI database is optional; carry on without names if it fails to load.
	db, err := pcidb.New()
	if err != nil {
		db = nil
	}

	// NVIDIA details: prefer nvidia-container-cli, then /proc, then nothing.
	nvidiaCards, err := loadNvidiaContainer()
	if err != nil {
		nvidiaCards, err = loadNvidiaProc()
	}

	if err != nil {
		nvidiaCards = map[string]*api.ResourcesGPUCardNvidia{}
	}

	// PCI addresses already handled, and virtual functions by parent address.
	var seen []string

	vfsByParent := map[string][]api.ResourcesGPUCard{}

	// file stores a finished card either under its physical function (when
	// the device has a "physfn" link) or in the top-level card list.
	file := func(devicePath string, card api.ResourcesGPUCard) error {
		physfnPath := filepath.Join(devicePath, "physfn")
		if !sysfsExists(physfnPath) {
			result.Cards = append(result.Cards, card)
			return nil
		}

		target, err := filepath.EvalSymlinks(physfnPath)
		if err != nil {
			return fmt.Errorf("Failed to find %q: %w", physfnPath, err)
		}

		parent := filepath.Base(target)
		vfsByParent[parent] = append(vfsByParent[parent], card)

		return nil
	}

	// First pass: the kernel DRM interface.
	if sysfsExists(sysClassDrm) {
		drmEntries, err := os.ReadDir(sysClassDrm)
		if err != nil {
			return nil, fmt.Errorf("Failed to list %q: %w", sysClassDrm, err)
		}

		for _, drmEntry := range drmEntries {
			name := drmEntry.Name()
			cardPath := filepath.Join(sysClassDrm, name)

			// Keep main card entries only: no renderers, no sub-cards.
			if !strings.HasPrefix(name, "card") || !sysfsExists(filepath.Join(cardPath, "dev")) {
				continue
			}

			devicePath := filepath.Join(cardPath, "device")

			addr, err := pciAddress(devicePath)
			if err != nil {
				return nil, fmt.Errorf("Failed to find PCI address for %q: %w", devicePath, err)
			}

			card := api.ResourcesGPUCard{}
			if addr != "" {
				// Several DRM entries may share one PCI device.
				if slices.Contains(seen, addr) {
					continue
				}

				card.PCIAddress = addr
				seen = append(seen, addr)
			}

			err = gpuAddDeviceInfo(devicePath, nvidiaCards, db, kernel, &card)
			if err != nil {
				return nil, fmt.Errorf("Failed to add device information for %q: %w", devicePath, err)
			}

			err = file(devicePath, card)
			if err != nil {
				return nil, err
			}
		}
	}

	// Second pass: whatever is left on the PCI bus.
	if sysfsExists(sysBusPci) {
		pciEntries, err := os.ReadDir(sysBusPci)
		if err != nil {
			return nil, fmt.Errorf("Failed to list %q: %w", sysBusPci, err)
		}

		for _, pciEntry := range pciEntries {
			addr := pciEntry.Name()

			// Already covered by the DRM pass.
			if slices.Contains(seen, addr) {
				continue
			}

			devicePath := filepath.Join(sysBusPci, addr)
			classPath := filepath.Join(devicePath, "class")

			// Devices without a class file cannot be identified.
			if !sysfsExists(classPath) {
				continue
			}

			class, err := os.ReadFile(classPath)
			if err != nil {
				return nil, fmt.Errorf("Failed to read %q: %w", classPath, err)
			}

			// Only care about VGA devices
			if !strings.HasPrefix(string(class), "0x03") {
				continue
			}

			card := api.ResourcesGPUCard{PCIAddress: addr}

			err = gpuAddDeviceInfo(devicePath, nvidiaCards, db, kernel, &card)
			if err != nil {
				return nil, fmt.Errorf("Failed to add device information for %q: %w", devicePath, err)
			}

			err = file(devicePath, card)
			if err != nil {
				return nil, err
			}
		}
	}

	// Attach virtual functions to their SR-IOV parents and count everything.
	result.Total = 0
	for i := range result.Cards {
		parent := &result.Cards[i]
		if parent.SRIOV != nil {
			parent.SRIOV.VFs = vfsByParent[parent.PCIAddress]
			result.Total += uint64(len(parent.SRIOV.VFs))
		}

		result.Total++
	}

	return &result, nil
}