|
1 | 1 | This file is automatically generated by assertExpectedJournal calls in test_signal_wait.py.
|
2 | 2 | Update expected outputs by running tests with the EXPECTTEST_ACCEPT=1 environment variable set.
|
3 | 3 |
|
| 4 | +--- assertExpectedJournal(TestWait.test_global_sync) |
| 5 | +from __future__ import annotations |
| 6 | + |
| 7 | +import torch |
| 8 | +import helion |
| 9 | +import triton |
| 10 | +import triton.language as tl |
| 11 | + |
| 12 | +@triton.jit |
| 13 | +def _gmem_multi_bar_sync_kernel_kernel(signal_pad, signal_pad_stride_0, signal_pad_stride_1, N, _BLOCK_SIZE_1: tl.constexpr): |
| 14 | + pid_0 = tl.program_id(0) |
| 15 | + offset_0 = pid_0 |
| 16 | + for offset_1 in tl.range(0, N.to(tl.int32), step=_BLOCK_SIZE_1): |
| 17 | + indices_1 = offset_1 + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32) |
| 18 | + helion.runtime.triton_send_signal(addr=signal_pad + (indices_1 * signal_pad_stride_0 + offset_0 * signal_pad_stride_1), update=1, sem='release', scope='gpu', op='atomic_xchg', skip_sync=True) |
| 19 | + helion.runtime.triton_wait_multiple_signal(addr=signal_pad + (offset_0 * signal_pad_stride_0 + indices_1 * signal_pad_stride_1), expect=1, update=0, sem='acquire', scope='gpu', op='ld', skip_sync=False) |
| 20 | + |
| 21 | +def gmem_multi_bar_sync_kernel(signal_pad: torch.Tensor): |
| 22 | + M, N = signal_pad.shape |
| 23 | + assert M == N |
| 24 | + _BLOCK_SIZE_1 = N |
| 25 | + _gmem_multi_bar_sync_kernel_kernel[N,](signal_pad, signal_pad.stride(0), signal_pad.stride(1), N, _BLOCK_SIZE_1, num_warps=4, num_stages=3) |
| 26 | + return signal_pad |
| 27 | + |
| 28 | +def _gmem_multi_bar_sync_kernel_make_precompiler(signal_pad: torch.Tensor): |
| 29 | + M, N = signal_pad.shape |
| 30 | + assert M == N |
| 31 | + _BLOCK_SIZE_1 = N |
| 32 | + from helion.runtime.precompile_shim import make_precompiler |
| 33 | + return make_precompiler(_gmem_multi_bar_sync_kernel_kernel)(signal_pad, signal_pad.stride(0), signal_pad.stride(1), N, _BLOCK_SIZE_1, num_warps=4, num_stages=3) |
| 34 | + |
4 | 35 | --- assertExpectedJournal(TestWait.test_signal_basic)
|
5 | 36 | from __future__ import annotations
|
6 | 37 |
|
@@ -76,6 +107,33 @@ def _gmem_signal_tensor_bar_kernel_make_precompiler(signal_pad: torch.Tensor):
|
76 | 107 | from helion.runtime.precompile_shim import make_precompiler
|
77 | 108 | return make_precompiler(_gmem_signal_tensor_bar_kernel_kernel)(signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
|
78 | 109 |
|
| 110 | +--- assertExpectedJournal(TestWait.test_signal_multiple_cas) |
| 111 | +from __future__ import annotations |
| 112 | + |
| 113 | +import torch |
| 114 | +import helion |
| 115 | +import triton |
| 116 | +import triton.language as tl |
| 117 | + |
| 118 | +@triton.jit |
| 119 | +def _gmem_signal_tensor_bar_kernel_kernel(signal_pad, signal_pad_stride_0, _BLOCK_SIZE_0: tl.constexpr): |
| 120 | + pid_0 = tl.program_id(0) |
| 121 | + offset_0 = pid_0 * _BLOCK_SIZE_0 |
| 122 | + indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32) |
| 123 | + helion.runtime.triton_wait_multiple_signal(addr=signal_pad + indices_0 * signal_pad_stride_0, expect=0, update=1, sem='release', scope='gpu', op='atomic_cas', skip_sync=True, sync_before=not False) |
| 124 | + |
| 125 | +def gmem_signal_tensor_bar_kernel(signal_pad: torch.Tensor): |
| 126 | + n, = signal_pad.shape |
| 127 | + _BLOCK_SIZE_0 = 4 |
| 128 | + _gmem_signal_tensor_bar_kernel_kernel[triton.cdiv(n, _BLOCK_SIZE_0),](signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3) |
| 129 | + return signal_pad |
| 130 | + |
| 131 | +def _gmem_signal_tensor_bar_kernel_make_precompiler(signal_pad: torch.Tensor): |
| 132 | + n, = signal_pad.shape |
| 133 | + _BLOCK_SIZE_0 = 4 |
| 134 | + from helion.runtime.precompile_shim import make_precompiler |
| 135 | + return make_precompiler(_gmem_signal_tensor_bar_kernel_kernel)(signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3) |
| 136 | + |
79 | 137 | --- assertExpectedJournal(TestWait.test_wait_2d_tile)
|
80 | 138 | from __future__ import annotations
|
81 | 139 |
|
@@ -144,3 +202,67 @@ def _gmem_wait_kernel_make_precompiler(signal_pad: torch.Tensor):
|
144 | 202 | from helion.runtime.precompile_shim import make_precompiler
|
145 | 203 | return make_precompiler(_gmem_wait_kernel_kernel)(signal_pad, out, out.stride(0), signal_pad.stride(0), num_warps=4, num_stages=3)
|
146 | 204 |
|
| 205 | +--- assertExpectedJournal(TestWait.test_wait_multi_bar) |
| 206 | +from __future__ import annotations |
| 207 | + |
| 208 | +import torch |
| 209 | +import helion |
| 210 | +import triton |
| 211 | +import triton.language as tl |
| 212 | + |
| 213 | +import __main__ as _source_module |
| 214 | + |
| 215 | +@triton.jit |
| 216 | +def _gmem_wait_multi_bar_kernel_kernel(signal_pad, out, out_stride_0, signal_pad_stride_0, _BLOCK_SIZE_0: tl.constexpr): |
| 217 | + pid_0 = tl.program_id(0) |
| 218 | + offset_0 = pid_0 * _BLOCK_SIZE_0 |
| 219 | + indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32) |
| 220 | + helion.runtime.triton_wait_multiple_signal(addr=signal_pad + indices_0 * signal_pad_stride_0, expect=1, update=0, sem='acquire', scope='gpu', op='ld', skip_sync=False) |
| 221 | + tile_id = offset_0 // _BLOCK_SIZE_0 |
| 222 | + tl.store(out + tile_id * out_stride_0, tile_id, None) |
| 223 | + |
| 224 | +def gmem_wait_multi_bar_kernel(signal_pad: torch.Tensor): |
| 225 | + N, = signal_pad.shape |
| 226 | + n = 4 |
| 227 | + out = torch.empty(n, dtype=torch.int32, device=_source_module.DEVICE) |
| 228 | + _BLOCK_SIZE_0 = 4 |
| 229 | + _gmem_wait_multi_bar_kernel_kernel[triton.cdiv(N, _BLOCK_SIZE_0),](signal_pad, out, out.stride(0), signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3) |
| 230 | + return out |
| 231 | + |
| 232 | +def _gmem_wait_multi_bar_kernel_make_precompiler(signal_pad: torch.Tensor): |
| 233 | + N, = signal_pad.shape |
| 234 | + n = 4 |
| 235 | + out = torch.empty(n, dtype=torch.int32, device=_source_module.DEVICE) |
| 236 | + _BLOCK_SIZE_0 = 4 |
| 237 | + from helion.runtime.precompile_shim import make_precompiler |
| 238 | + return make_precompiler(_gmem_wait_multi_bar_kernel_kernel)(signal_pad, out, out.stride(0), signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3) |
| 239 | + |
| 240 | +--- assertExpectedJournal(TestWait.test_wait_multi_bar_cas) |
| 241 | +from __future__ import annotations |
| 242 | + |
| 243 | +import torch |
| 244 | +import helion |
| 245 | +import triton |
| 246 | +import triton.language as tl |
| 247 | + |
| 248 | +@triton.jit |
| 249 | +def _gmem_wait_multi_bar_kernel_cas_kernel(signal_pad, signal_pad_stride_0, _BLOCK_SIZE_0: tl.constexpr): |
| 250 | + pid_0 = tl.program_id(0) |
| 251 | + offset_0 = pid_0 * _BLOCK_SIZE_0 |
| 252 | + indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32) |
| 253 | + helion.runtime.triton_wait_multiple_signal(addr=signal_pad + indices_0 * signal_pad_stride_0, expect=1, update=2, sem='acquire', scope='gpu', op='atomic_cas', skip_sync=False) |
| 254 | + |
| 255 | +def gmem_wait_multi_bar_kernel_cas(signal_pad: torch.Tensor): |
| 256 | + N, = signal_pad.shape |
| 257 | + n = 4 |
| 258 | + _BLOCK_SIZE_0 = 4 |
| 259 | + _gmem_wait_multi_bar_kernel_cas_kernel[triton.cdiv(N, _BLOCK_SIZE_0),](signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3) |
| 260 | + return signal_pad |
| 261 | + |
| 262 | +def _gmem_wait_multi_bar_kernel_cas_make_precompiler(signal_pad: torch.Tensor): |
| 263 | + N, = signal_pad.shape |
| 264 | + n = 4 |
| 265 | + _BLOCK_SIZE_0 = 4 |
| 266 | + from helion.runtime.precompile_shim import make_precompiler |
| 267 | + return make_precompiler(_gmem_wait_multi_bar_kernel_cas_kernel)(signal_pad, signal_pad.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3) |
| 268 | + |
0 commit comments