
Commit 915fd9f

Add hl.signal
stack-info: PR: #233, branch: joydddd/stack/8
1 parent 67a72f1 commit 915fd9f

3 files changed, +101 -0 lines changed

helion/_triton_ext/gmem_barrier.py

Lines changed: 43 additions & 0 deletions
@@ -5,6 +5,49 @@
 import triton.language as tl


+@triton.jit
+def _triton_send_signal(
+    addr,  # can be a scalar or a vector of pointers.
+    update: tl.constexpr,
+    sem: tl.constexpr,
+    scope: tl.constexpr,
+    op: tl.constexpr,
+    skip_sync: tl.constexpr,
+) -> None:
+    """
+    Send a signal to a global memory barrier.
+
+    Atomically updates the barrier at `addr` (with the given memory semantic,
+    scope, and atomic op) so that threads spinning in `_triton_wait_signal`
+    observe the new value.
+
+    Args:
+        addr: Barrier address(es) to signal (scalar or vector of pointers)
+        update: Value to write to the barrier
+    """
+    if not skip_sync:
+        tl.inline_asm_elementwise(
+            "bar.sync 0;", "=r", [], dtype=tl.int32, is_pure=False, pack=1
+        )
+
+    tl.static_assert(
+        sem == "release" or sem == "relaxed",
+        "Invalid memory semantic. options: 'release', 'relaxed'. ",
+    )
+    tl.static_assert(
+        scope == "gpu" or scope == "sys", "Invalid scope. options: 'gpu','sys'. "
+    )
+
+    if op == "atomic_xchg":
+        tl.atomic_xchg(addr, update, sem=sem, scope=scope)
+    elif op == "atomic_add":
+        tl.atomic_add(addr, update, sem=sem, scope=scope)
+    else:
+        raise NotImplementedError(
+            f"Unsupported op '{op}' for send signal on gmem barrier. "
+        )
+
+
 @triton.jit
 def _triton_wait_signal(
     addr,
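
For orientation, here is a minimal sketch (not part of the diff) of how the new sender pairs with the existing `_triton_wait_signal` spin loop. The producer/consumer kernel names, the barrier tensor, and the host-side launch are hypothetical; the helper signatures and the `hl_ext` alias follow this file and the generated code asserted in test_signal_wait.py.

import torch
import triton

from helion import _triton_ext as hl_ext  # gmem barrier helpers


@triton.jit
def _producer_kernel(barrier_ptr):
    # Illustrative: publish "ready" (1) with release semantics so that earlier
    # global-memory stores become visible to whoever observes the flag.
    hl_ext._triton_send_signal(
        barrier_ptr, update=1, sem="release", scope="gpu",
        op="atomic_xchg", skip_sync=False,
    )


@triton.jit
def _consumer_kernel(barrier_ptr):
    # Illustrative: spin until the producer has written 1 (argument names match
    # the existing _triton_wait_signal helper as used in the tests).
    hl_ext._triton_wait_signal(
        addr=barrier_ptr, expect=1, update=0, sem="acquire", scope="gpu",
        op="ld", skip_sync=False,
    )


barrier = torch.zeros(1, device="cuda", dtype=torch.int32)
_producer_kernel[(1,)](barrier)
_consumer_kernel[(1,)](barrier)

Launched back-to-back on one stream this ordering is trivial; the interesting case is two concurrent kernels coordinating through the same barrier slot.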

helion/language/signal_wait.py

Lines changed: 30 additions & 0 deletions
@@ -12,6 +12,9 @@
 import ast

 from .._compiler.inductor_lowering import CodegenState
+from helion._compiler.type_propagation import SymIntType
+
+__all__ = ["signal", "wait"]


 @has_side_effect
@@ -195,3 +198,30 @@ def _(state: CodegenState) -> ast.AST:
         signal=signal_expr,
         update=update_expr,
     )
+
+
+@has_side_effect
+@_decorators.api(tiles_as_sizes=True)
+def signal(
+    signal_pad: torch.Tensor,
+    index: list[object],
+    signal: int = 1,
+    op: str = "atomic_xchg",
+    sem: str = "release",
+    scope: str = "gpu",
+    skip_sync: bool = False,
+) -> torch.Tensor | SymIntType:
+    raise exc.NotInsideKernel
+
+
+@_decorators.register_fake(signal)
+def _(
+    signal_pad: torch.Tensor,
+    index: list[object],
+    signal: int = 1,
+    op: str = "atomic_xchg",
+    sem: str = "release",
+    scope: str = "gpu",
+    skip_sync: bool = False,
+) -> torch.Tensor:
+    return signal_pad.new_empty(SubscriptIndexing.compute_shape(signal_pad, index))
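
As a usage sketch (not part of the diff): inside a Helion device loop, a call like the one below presumably lowers to a single `hl_ext._triton_send_signal` call, with `signal` supplying the `update` value and `op`/`sem`/`scope`/`skip_sync` forwarded as constexprs. The kernel name, tensor shape, and the `op="atomic_add"` choice are illustrative only; the `hl` alias follows the existing tests.

import torch

import helion
import helion.language as hl


@helion.kernel
def producer_done_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
    (n,) = signal_pad.shape
    for i in hl.grid(n):
        # ... produce block i's result here ...
        # Bump the per-block counter instead of overwriting it, by choosing
        # atomic_add rather than the default atomic_xchg.
        hl.signal(signal_pad, [i], signal=1, op="atomic_add")
    return signal_pad


signal_pad = torch.zeros(8, device="cuda", dtype=torch.int32)
producer_done_kernel(signal_pad)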

test/test_signal_wait.py

Lines changed: 28 additions & 0 deletions
@@ -101,6 +101,34 @@ def _wait_for_2d_tile_kernel_make_precompiler(signal_pad: torch.Tensor, x: torch
             code,
         )

+    def test_basic_signal(self):
+        @helion.kernel
+        def gmem_signal_kernel(signal_pad: torch.Tensor) -> torch.Tensor:
+            (n,) = signal_pad.shape
+            for i in hl.grid(n):
+                hl.signal(signal_pad, [i], signal=1)
+            return signal_pad
+
+        signal_pad = torch.ones(4, device=DEVICE, dtype=torch.int32)
+        code, result = code_and_output(gmem_signal_kernel, (signal_pad,))
+        torch.testing.assert_close(
+            result, torch.ones(4, device=DEVICE, dtype=torch.int32)
+        )
+        self.maxDiff = None
+        self.assertIn(
+            "from helion import _triton_ext as hl_ext", code
+        )  # Import hl_ext.
+        self.assertIn(
+            """\
+@triton.jit
+def _gmem_signal_kernel_kernel(signal_pad, signal_pad_stride_0):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0
+    hl_ext._triton_send_signal(addr=signal_pad + offset_0 * signal_pad_stride_0, update=1, sem='release', scope='gpu', op='atomic_xchg', skip_sync=False)""",
+            code,
+        )
+

 if __name__ == "__main__":
     unittest.main()
