Detects the devices available. If artificial devices are configured, those are preferred; otherwise, cuda, rocm, cpu (or no) devices are detected based on the configured slot type.
(slotType, agentID, visibleGPUs string, artificialSlots int)
| 17 | // Detect detects the devices available. If artificial devices are configured, those are preferred; |
| 18 | // otherwise, cuda, rocm, cpu (or no) devices are detected based on the configured slot type. |
| 19 | func Detect(slotType, agentID, visibleGPUs string, artificialSlots int) ([]device.Device, error) { |
| 20 | // Log detected nvidia version. |
| 21 | v, err := getNvidiaVersion() |
| 22 | if err != nil { |
| 23 | return nil, fmt.Errorf("failed to get nvidia version: %w", err) |
| 24 | } else if v != "" { |
| 25 | log.Infof("Nvidia driver version: %s", v) |
| 26 | } |
| 27 | |
| 28 | // Log detected rocm version. |
| 29 | v, err = getRocmVersion() |
| 30 | if err != nil { |
| 31 | return nil, fmt.Errorf("failed to get rocm version: %w", err) |
| 32 | } else if v != "" { |
| 33 | log.Infof("Rocm driver version: %s", v) |
| 34 | } |
| 35 | |
| 36 | // Detect devices available to the agent. |
| 37 | var detected []device.Device |
| 38 | switch { |
| 39 | case artificialSlots > 0: |
| 40 | // Generate random UUIDs consistent across agent restarts as long as |
| 41 | // agentID is the same. |
| 42 | rnd, sErr := randFromString(agentID) |
| 43 | if sErr != nil { |
| 44 | return nil, sErr |
| 45 | } |
| 46 | |
| 47 | for i := 0; i < artificialSlots; i++ { |
| 48 | u, rErr := uuid.NewRandomFromReader(rnd) |
| 49 | if rErr != nil { |
| 50 | return nil, rErr |
| 51 | } |
| 52 | id := u.String() |
| 53 | detected = append(detected, device.Device{ |
| 54 | ID: device.ID(i), Brand: "Artificial", UUID: id, Type: device.CPU, |
| 55 | }) |
| 56 | } |
| 57 | case slotType == "none": |
| 58 | detected = []device.Device{} |
| 59 | case slotType == "cuda" || slotType == "gpu": |
| 60 | // Support "gpu" for backwards compatibility. |
| 61 | detected, err = detectCudaGPUs(visibleGPUs) |
| 62 | if err != nil { |
| 63 | return nil, errors.Wrap( |
| 64 | err, |
| 65 | "error while gathering GPU info through nvidia-smi command", |
| 66 | ) |
| 67 | } |
| 68 | case slotType == "rocm": |
| 69 | detected, err = detectRocmGPUs(visibleGPUs) |
| 70 | if err != nil { |
| 71 | return nil, errors.Wrap(err, "error while gathering GPU info through rocm-smi command") |
| 72 | } |
| 73 | case slotType == "cpu": |
| 74 | detected, err = detectCPUs() |
| 75 | if err != nil { |
| 76 | return nil, err |
no test coverage detected