()
| 84 | ) |
| 85 | # borrowed from PyTorch, test/test_cuda.py: test_record_stream() |
def test_record_stream_ndarray():
    """Exercise ``record_stream`` on a DGL NDArray used across two CUDA streams.

    A host array is copied to GPU 0 inside a side stream; the device copy is
    then recorded on the current stream, which is delayed before it consumes
    the copy into ``result``.  The test asserts, in order, that:

    * a fresh side-stream allocation made right after the copy was queued does
      not land on the copied array's address ("allocation re-used too soon");
    * ``result`` holds the original values;
    * after the current stream has been synchronized, a fresh side-stream
      allocation does land on that address ("allocation not re-used").
    """
    cycles_per_ms = _get_cycles_per_ms()

    expected = [1.0, 2.0, 3.0, 4.0]
    host = nd.array(np.array(expected, dtype=np.float32), ctx=nd.cpu())
    host.pin_memory_()
    result = nd.empty([4], ctx=nd.gpu(0))
    side_stream = torch.cuda.Stream()
    # Device address of the staged copy; filled in by the helper below.
    copied_ptr = None

    def _copy_via_side_stream():
        # Performs the CPU->GPU copy in a background stream.  ``staged`` is
        # deliberately local so the reference is dropped when this returns.
        nonlocal copied_ptr
        with torch.cuda.stream(side_stream):
            staged = host.copyto(nd.gpu(0))
            copied_ptr = F.from_dgl_nd(staged).data_ptr()
        main_stream = torch.cuda.current_stream()
        main_stream.wait_stream(side_stream)
        # Tell the allocator that the current stream also uses ``staged``.
        staged.record_stream(to_dgl_stream_handle(main_stream))
        torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
        result.copyfrom(staged)

    _copy_via_side_stream()

    # The current stream has not finished with the staged block yet, so a new
    # allocation on the side stream must get a different address.
    with torch.cuda.stream(side_stream):
        early_alloc = nd.empty([4], ctx=nd.gpu(0))
        assert (
            copied_ptr != F.from_dgl_nd(early_alloc).data_ptr()
        ), "allocation re-used too soon"

    assert torch.equal(F.from_dgl_nd(result).cpu(), torch.tensor(expected))

    # Check that the block will be re-used after the main stream finishes.
    # ``early_alloc`` stays referenced so its own block is not the one handed
    # back here.
    torch.cuda.current_stream().synchronize()
    with torch.cuda.stream(side_stream):
        late_alloc = nd.empty([4], ctx=nd.gpu(0))
        assert (
            copied_ptr == F.from_dgl_nd(late_alloc).data_ptr()
        ), "allocation not re-used"
| 123 | |
| 124 | |
| 125 | @unittest.skipIf( |
no test coverage detected