| 11 | |
| 12 | |
| 13 | class AsyncIOBuilder(TorchCPUOpBuilder): |
| 14 | BUILD_VAR = "DS_BUILD_AIO" |
| 15 | NAME = "async_io" |
| 16 | |
def __init__(self):
    """Initialize the builder, registering it under the class-level ``NAME``."""
    op_name = self.NAME
    super().__init__(name=op_name)
| 19 | |
def absolute_name(self):
    """Return the dotted module path of the op, derived from ``NAME``."""
    qualified_name = f'deepspeed.ops.aio.{self.NAME}_op'
    return qualified_name
| 22 | |
def lib_sources(self):
    """Return the C++ source files of the library, as repo-relative paths.

    A new list is built on every call, so callers may extend it freely.
    """
    return [
        'csrc/aio/py_lib/deepspeed_py_io_handle.cpp',
        'csrc/aio/py_lib/deepspeed_py_aio.cpp',
        'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp',
        'csrc/aio/py_lib/deepspeed_aio_thread.cpp',
        'csrc/aio/common/deepspeed_aio_utils.cpp',
        'csrc/aio/common/deepspeed_aio_common.cpp',
        'csrc/aio/common/deepspeed_aio_types.cpp',
        'csrc/aio/py_lib/deepspeed_cpu_op.cpp',
        'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp',
        'csrc/aio/py_lib/deepspeed_py_copy.cpp',
        'csrc/aio/py_lib/deepspeed_pin_tensor.cpp',
    ]
| 33 | |
def sources(self):
    """Return the library sources followed by the Python binding source file."""
    binding_sources = ['csrc/aio/py_lib/py_ds_aio.cpp']
    library_sources = self.lib_sources()
    # Concatenation yields a new list; the list from lib_sources() is not mutated.
    return library_sources + binding_sources
| 36 | |
def include_paths(self):
    """Return the include directories needed to compile the op.

    The two project directories are always returned. Unless
    ``self.build_for_cpu`` is set, accelerator toolkit headers are appended:
    ``CUDA_HOME/include`` when ``self.is_rocm_pytorch()`` is false, otherwise
    the ROCm ``include``, ``include/rocrand`` and ``include/hiprand``
    directories.
    """
    project_paths = ['csrc/aio/py_lib', 'csrc/aio/common']
    if self.build_for_cpu:
        # CPU builds need no toolkit headers, so torch is never imported here.
        return project_paths

    # Import the submodule explicitly (as extra_ldflags does): a bare
    # `import torch` does not by itself guarantee that
    # torch.utils.cpp_extension has been loaded.
    import torch.utils.cpp_extension
    cpp_extension = torch.utils.cpp_extension

    if not self.is_rocm_pytorch():
        toolkit_paths = [os.path.join(cpp_extension.CUDA_HOME, "include")]
    else:
        rocm_home = cpp_extension.ROCM_HOME
        toolkit_paths = [
            os.path.join(rocm_home, "include"),
            os.path.join(rocm_home, "include", "rocrand"),
            os.path.join(rocm_home, "include", "hiprand"),
        ]
    return project_paths + toolkit_paths
| 50 | |
def cxx_args(self):
    """Return the C++ compiler flags for this op.

    Starts from the parent builder's flags, switches the language standard
    to C++14 when the installed torch is older than 2.1, and appends the
    warning/optimization/linkage flags used by this op.
    """
    args = super().cxx_args()
    import torch
    torch_version = tuple(map(int, torch.__version__.split('.')[0:2]))
    # Compare (major, minor) as a tuple. Testing the two parts independently
    # would classify any X.0 release after 2.x (e.g. 3.0) as older than 2.1.
    if torch_version < (2, 1):
        # Tolerate a parent that did not supply the C++17 flag instead of
        # raising ValueError from list.remove().
        if '-std=c++17' in args:
            args.remove('-std=c++17')
        args.append('-std=c++14')
    # -O0 for improved debugging, since performance is bound by I/O
    args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder']
    return args
| 61 | |
| 62 | def extra_ldflags(self): |
| 63 | if self.build_for_cpu: |
| 64 | return ['-fopenmp'] |
| 65 | |
| 66 | import torch.utils.cpp_extension |
| 67 | CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME |
| 68 | if CUDA_HOME is None: |
| 69 | ldflags = ['-laio'] # the ROCM case |
| 70 | else: |
no outgoing calls
searching dependent graphs…