()
| 79 | |
| 80 | |
def set_accelerator_visible():
    """Set ``CUDA_VISIBLE_DEVICES`` to a device list rotated by the xdist worker id.

    If ``CUDA_VISIBLE_DEVICES`` is already present in the environment its
    value is used as the device list. Otherwise the devices are discovered by
    running an accelerator-specific command line tool, selected by
    ``get_accelerator().device_name()``:

    * ``cuda``: ``rocm-smi --showid`` when ``is_rocm_pytorch()`` is true,
      else ``nvidia-smi --list-gpus``
    * ``xpu``:  ``clinfo``
    * ``hpu``:  ``hl-smi -L``, falling back to listing ``/dev/accel`` when
      ``hl-smi`` is not installed; also sets ``HABANA_VISIBLE_MODULES``
    * ``npu``:  ``npu-smi info -l``
    * ``supa``: ``brsmi gpu list``
    * anything else must be ``cpu``; ``_get_cpu_socket_count()`` is used

    The list is then rotated left by the value returned from
    ``get_xdist_worker_id()`` (treated as 0 when that returns ``None``) and
    written back to ``os.environ["CUDA_VISIBLE_DEVICES"]``. The rotation wraps
    around, so a worker id greater than or equal to the number of devices
    still yields a rotated order instead of the unrotated list.

    Raises:
        AssertionError: if the device name is none of the handled ones.
        FileNotFoundError, subprocess.CalledProcessError: propagated from
            ``subprocess.check_output`` when a discovery tool is missing or
            exits with a non-zero status (except the ``hl-smi`` fallback).
    """
    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None)
    xdist_worker_id = get_xdist_worker_id()
    if xdist_worker_id is None:
        xdist_worker_id = 0

    if cuda_visible is None:
        # CUDA_VISIBLE_DEVICES is not set, discover it using accelerator specific command instead
        device_name = get_accelerator().device_name()

        def _output_lines(command):
            # Run the tool and return its stripped stdout split into lines.
            return subprocess.check_output(command).decode('utf-8').strip().split('\n')

        if device_name == 'cuda':
            if is_rocm_pytorch():
                num_accelerators = len([s for s in _output_lines(['rocm-smi', '--showid']) if 'GPU' in s])
            else:
                num_accelerators = len(_output_lines(['nvidia-smi', '--list-gpus']))
        elif device_name == 'xpu':
            num_accelerators = sum(1 for line in _output_lines(['clinfo']) if re.search('Device Type.*GPU', line))
        elif device_name == 'hpu':
            # For HPU the result is a list of id strings rather than a count.
            try:
                hl_smi = subprocess.check_output(['hl-smi', "-L"])
                num_accelerators = re.findall(r"Module ID\s+:\s+(\d+)", hl_smi.decode())
            except FileNotFoundError:
                # hl-smi is not available: fall back to the device nodes.
                sim_list = subprocess.check_output(['ls', '-1', '/dev/accel'])
                num_accelerators = re.findall(r"accel(\d+)", sim_list.decode())
            num_accelerators = sorted(num_accelerators, key=int)
            os.environ["HABANA_VISIBLE_MODULES"] = ",".join(num_accelerators)
        elif device_name == 'npu':
            # The count is taken from the text after ':' on the first output line.
            num_accelerators = int(_output_lines(['npu-smi', 'info', '-l'])[0].split(':')[1].strip())
        elif device_name == 'supa':
            num_accelerators = len([s for s in _output_lines(['brsmi', 'gpu', 'list']) if 'GPU' in s])
        else:
            assert device_name == 'cpu'
            num_accelerators = _get_cpu_socket_count()

        if isinstance(num_accelerators, list):
            cuda_visible = ",".join(num_accelerators)
        else:
            cuda_visible = ",".join(map(str, range(num_accelerators)))

    # rotate list based on xdist worker id, example below
    # wid=0 -> ['0', '1', '2', '3']
    # wid=1 -> ['1', '2', '3', '0']
    # wid=2 -> ['2', '3', '0', '1']
    # wid=3 -> ['3', '0', '1', '2']
    # wid=4 -> ['0', '1', '2', '3']  (wraps around)
    dev_id_list = cuda_visible.split(",")
    # str.split always returns at least one element, so the modulo cannot
    # divide by zero. Without it, a worker id >= len(dev_id_list) would make
    # the slices below return the list unrotated.
    shift = xdist_worker_id % len(dev_id_list)
    dev_id_list = dev_id_list[shift:] + dev_id_list[:shift]
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(dev_id_list)
| 137 | |
| 138 |
no test coverage detected
searching dependent graphs…