| 389 | |
| 390 | |
def _parse_nvidia_smi_stats() -> list[dict[str, Any]]:
    """Collect per-GPU memory statistics by running the nvidia-smi command.

    Runs ``_NVIDIA_GPU_STATS_CMD`` and parses its stdout. Each non-empty line
    is expected to hold five ", "-separated fields, in this order: GPU index,
    GPU name, total memory, used memory, free memory. Memory figures are
    multiplied by 1024 * 1024 to convert them to bytes.

    Parsing is best-effort: a line that cannot be parsed is logged and
    skipped instead of aborting the whole collection.

    Returns:
        One dict per parsed GPU with the keys ``"index"`` (int), ``"name"``
        (str) and ``"memory"`` (a dict with ``"total"``, ``"used"`` and
        ``"free"`` in bytes plus ``"percent"`` used). An empty list is
        returned when the command cannot be started or exits with an error.
    """
    import subprocess

    try:
        result = subprocess.run(
            _NVIDIA_GPU_STATS_CMD,
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers FileNotFoundError (binary missing) as well as other
        # launch failures such as PermissionError.
        LOGGER.warning("Failed to extract Nvidia GPU stats: %s", e)
        return []

    bytes_per_mb = 1024 * 1024
    stats: list[dict[str, Any]] = []
    for raw_line in result.stdout.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        try:
            # Peel the index off the left and the three memory fields off the
            # right, so a GPU name that itself contains ", " stays intact.
            index_str, rest = line.split(", ", 1)
            name, *memory_fields = rest.rsplit(", ", 3)
            if len(memory_fields) != 3:
                raise ValueError("expected 5 comma-separated fields")
            index = int(index_str)
            # Unavailable values are reported as bracketed placeholders
            # ("[N/A]" is what you get on a DGX Spark); count those as 0.
            total, used, free = (
                0 if field.strip().startswith("[") else int(field) * bytes_per_mb
                for field in memory_fields
            )
        except ValueError as e:
            LOGGER.warning("Skipping unparsable nvidia-smi line %r: %s", line, e)
            continue
        stats.append(
            {
                "index": index,
                "name": name.strip(),
                "memory": {
                    "total": total,
                    "used": used,
                    "free": free,
                    # Guard against division by zero when total is unknown.
                    "percent": (used / total) * 100 if total > 0 else 0,
                },
            }
        )
    return stats
| 432 | |
| 433 | |
| 434 | def _parse_rocm_smi_stats() -> list[dict[str, Any]]: |