
Commit ce3b6c7

Move register_block_size/register_reduction_dim to tunable_ops.py (#161)
1 parent 2561136 commit ce3b6c7

3 files changed: +142 -136 lines changed

helion/language/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -7,8 +7,6 @@
 from .device_print import device_print as device_print
 from .loops import Tile as Tile
 from .loops import grid as grid
-from .loops import register_block_size as register_block_size
-from .loops import register_reduction_dim as register_reduction_dim
 from .loops import tile as tile
 from .memory_ops import atomic_add as atomic_add
 from .memory_ops import load as load
@@ -17,5 +15,7 @@
 from .tiles import tile_block_size as tile_block_size
 from .tiles import tile_end as tile_end
 from .tiles import tile_index as tile_index
+from .tunable_ops import register_block_size as register_block_size
+from .tunable_ops import register_reduction_dim as register_reduction_dim
 from .tunable_ops import register_tunable as register_tunable
 from .view_ops import subscript as subscript
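
Note that both symbols are still re-exported from helion.language; only the defining module changes, so downstream imports are unaffected. A minimal sanity check, assuming helion is installed in your environment (illustrative, not part of this commit):

import helion.language as hl

# The re-exports above keep the public import paths stable even though
# the definitions now live in tunable_ops.py instead of loops.py.
assert callable(hl.register_block_size)
assert callable(hl.register_reduction_dim)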

helion/language/loops.py

Lines changed: 1 addition & 132 deletions
@@ -8,15 +8,12 @@
 from typing import overload
 
 import torch
-from torch._inductor.codegen.simd import constant_repr
-from torch._inductor.runtime.runtime_utils import next_power_of_2
 from torch._inductor.runtime.triton_heuristics import get_max_y_grid
 
 from .. import exc
 from .._compiler.ast_extension import ExtendedAST
 from .._compiler.ast_extension import LoopType
 from .._compiler.ast_extension import expr_from_string
-from .._compiler.compile_environment import AutoSize
 from .._compiler.compile_environment import CompileEnvironment
 from .._compiler.tile_index_proxy import TileIndexProxy
 from .._compiler.type_propagation import GridIndexType
@@ -26,7 +23,6 @@
 from .._compiler.type_propagation import TileIndexType
 from .._compiler.type_propagation import TypeInfo
 from .._compiler.type_propagation import UnknownType
-from ..autotuner.config_fragment import assert_integer_power_of_two
 from ..autotuner.config_spec import ConfigSpec
 from ..autotuner.config_spec import FlattenLoopSpec
 from ..autotuner.config_spec import L2GroupingSpec
@@ -39,7 +35,7 @@
 from .._compiler.inductor_lowering import CodegenState
 
 
-__all__ = ["Tile", "grid", "register_block_size", "register_reduction_dim", "tile"]
+__all__ = ["Tile", "grid", "tile"]
 Tile = TileIndexProxy
 
 
@@ -372,130 +368,3 @@ def _(state: CodegenState) -> ast.AST:
         state.tile_strategy.codegen_grid(state, block_ids)
         return expr_from_string("None")
     raise AssertionError(f"Expected loop type: {loop_type}")
-
-
-@_decorators.api(is_device_only=False, cache_type=True, tiles_as_sizes=True)
-def register_block_size(min_or_max: int, max_or_none: int | None = None, /) -> int:
-    """
-    Explicitly register a block size that should be autotuned and can be used for
-    allocations and inside hl.tile(..., block_size=...).
-
-    This is useful if you have two loops where you want them to share a block size,
-    or if you need to allocate a kernel tensor before the hl.tile() loop.
-
-    The signature can be one of:
-        hl.register_block_size(max)
-        hl.register_block_size(min, max)
-
-    Where min and max are integers that control the range of block_sizes searched by
-    the autotuner. Max may be a symbolic shape, but min must be a constant integer.
-    """
-    raise exc.NotInsideKernel
-
-
-@_decorators.type_propagation(register_block_size)
-def _(
-    min_or_max: TypeInfo, max_or_none: TypeInfo | None = None, /, *, origin: Origin
-) -> TypeInfo:
-    from .._compiler.type_propagation import SymIntType
-
-    min_type, max_type = _normalize_begin_end(min_or_max, max_or_none, origin=origin)
-    min_proxy = _to_proxy(min_type)
-    max_proxy = _to_proxy(max_type)
-    if not isinstance(max_proxy, (int, torch.SymInt)):
-        raise exc.IncorrectTileUsage(
-            f"expected max to be an integer or size, got {max_proxy!s}"
-        )
-    if not isinstance(min_proxy, int):
-        raise exc.IncorrectTileUsage(
-            f"expected min to be an integer constant, got {min_proxy!s}"
-        )
-    env = CompileEnvironment.current()
-    result = TileIndexType.allocate(AutoSize(), origin)
-    loop_spec = env.config_spec.block_sizes.block_id_lookup(result.block_id)
-    loop_spec.min_size = assert_integer_power_of_two(max(1, min_proxy))
-    loop_spec.max_size = next_power_of_2(env.size_hint(max_proxy))
-    block_id = result.block_id
-    return SymIntType(origin, env.block_sizes[block_id].var)
-
-
-def _block_id_from_state(state: CodegenState) -> int:
-    """Extract the block_id from the current state for hl.register_block_size nodes."""
-    from .._compiler.type_propagation import SymIntType
-
-    env = CompileEnvironment.current()
-    if state.fx_node is not None:
-        val = state.fx_node.meta["val"]
-        assert isinstance(val, SymIntType)
-        block_id = env.get_block_id(val.value)
-        assert block_id is not None
-        return block_id
-    current_node = ExtendedAST.current()[-1]
-    type_info = current_node._type_info
-    assert isinstance(type_info, SymIntType)
-    block_id = env.get_block_id(type_info.value)
-    assert block_id is not None
-    return block_id
-
-
-@_decorators.codegen(register_block_size)
-def _(state: CodegenState) -> ast.AST:
-    env = CompileEnvironment.current()
-    block_size = env.config_spec.block_sizes.config_get(
-        state.config.block_sizes, _block_id_from_state(state)
-    )
-    assert block_size is not None
-    return expr_from_string(constant_repr(block_size))
-
-
-@_decorators.api(is_device_only=False, cache_type=True, tiles_as_sizes=True)
-def register_reduction_dim(
-    size: int,
-) -> int:
-    """
-    Explicitly register a reduction dimension that should be used for reduction operations.
-
-    This is useful when you need to allocate a dimension for reduction that isn't
-    automatically inferred from a slice operation. The registered dimension can be
-    used for allocations and operations that require knowing the reduction size upfront.
-
-    :param size: An integer representing the reduction dimension size.
-    :return: A SymInt object representing the reduction dimension size.
-    """
-    raise exc.NotInsideKernel
-
-
-@_decorators.type_propagation(register_reduction_dim)
-def _(sizes: TypeInfo, *, origin: Origin) -> TypeInfo:
-    from .._compiler.compile_environment import CompileEnvironment
-    from .._compiler.type_propagation import SymIntType
-
-    try:
-        proxy_sizes = sizes.proxy()
-        if not isinstance(proxy_sizes, int | torch.SymInt):
-            raise NotImplementedError
-    except NotImplementedError:
-        raise exc.TypePropagationError(
-            UnknownType(
-                origin,
-                f"register_reduction_dim() expected int or list[int], got {sizes!s}",
-                chained_from=sizes,
-            )
-        ) from None
-
-    env = CompileEnvironment.current()
-
-    rdim = env.allocate_reduction_dimension(proxy_sizes)
-    return SymIntType(origin, rdim.var)
-
-
-@_decorators.codegen(register_reduction_dim)
-def _(state: CodegenState) -> ast.AST:
-    """Generate code for register_reduction_dim - return the size expression"""
-    from .._compiler.type_propagation import SymIntType
-
-    current_node = ExtendedAST.current()[-1]
-    type_info = current_node._type_info
-
-    assert isinstance(type_info, SymIntType)
-    return current_node.args[0]  # pyre-ignore[16]
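
The docstring removed here (and re-added in tunable_ops.py below) describes the two call signatures. As a hedged sketch of the shared-block-size use case it mentions (the kernel body and tensor names are illustrative assumptions, built on the @helion.kernel decorator and the hl.tile(..., block_size=...) API named in the docstring):

import torch
import helion
import helion.language as hl

@helion.kernel()
def double_then_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    (n,) = x.size()
    out = torch.empty_like(x)
    # One autotuned block size, searched over powers of two in [16, n],
    # shared by both loops below.
    bs = hl.register_block_size(16, n)
    for tile0 in hl.tile(n, block_size=bs):
        out[tile0] = x[tile0] * 2.0
    for tile1 in hl.tile(n, block_size=bs):
        out[tile1] = out[tile1] + y[tile1]
    return out

Because min must be a power of two (see assert_integer_power_of_two above), 16 is used here; the autotuner then picks one block size that both loops observe.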

helion/language/tunable_ops.py

Lines changed: 139 additions & 2 deletions
@@ -2,23 +2,160 @@
 
 from typing import TYPE_CHECKING
 
+import torch
 from torch._inductor.codegen.simd import constant_repr
+from torch._inductor.runtime.runtime_utils import next_power_of_2
 
 from .. import exc
+from .._compiler.ast_extension import ExtendedAST
 from .._compiler.ast_extension import expr_from_string
+from .._compiler.compile_environment import AutoSize
+from .._compiler.compile_environment import CompileEnvironment
+from .._compiler.type_propagation import TileIndexType
+from .._compiler.type_propagation import TypeInfo
+from .._compiler.type_propagation import UnknownType
+from .._compiler.type_propagation import _to_proxy
 from ..autotuner.config_fragment import ConfigSpecFragment
+from ..autotuner.config_fragment import assert_integer_power_of_two
 from ..autotuner.config_spec import VALID_KEYS
 from ..exc import NotInsideKernel
 from . import _decorators
+from .loops import _normalize_begin_end
 
 if TYPE_CHECKING:
     import ast
 
     from .._compiler.inductor_lowering import CodegenState
-    from .._compiler.type_propagation import TypeInfo
     from .._compiler.variable_origin import Origin
 
-__all__ = ["register_tunable"]
+__all__ = ["register_block_size", "register_reduction_dim", "register_tunable"]
+
+
+@_decorators.api(is_device_only=False, cache_type=True, tiles_as_sizes=True)
+def register_block_size(min_or_max: int, max_or_none: int | None = None, /) -> int:
+    """
+    Explicitly register a block size that should be autotuned and can be used for
+    allocations and inside hl.tile(..., block_size=...).
+
+    This is useful if you have two loops where you want them to share a block size,
+    or if you need to allocate a kernel tensor before the hl.tile() loop.
+
+    The signature can be one of:
+        hl.register_block_size(max)
+        hl.register_block_size(min, max)
+
+    Where min and max are integers that control the range of block_sizes searched by
+    the autotuner. Max may be a symbolic shape, but min must be a constant integer.
+    """
+    raise exc.NotInsideKernel
+
+
+@_decorators.type_propagation(register_block_size)
+def _(
+    min_or_max: TypeInfo, max_or_none: TypeInfo | None = None, /, *, origin: Origin
+) -> TypeInfo:
+    from .._compiler.type_propagation import SymIntType
+
+    min_type, max_type = _normalize_begin_end(min_or_max, max_or_none, origin=origin)
+    min_proxy = _to_proxy(min_type)
+    max_proxy = _to_proxy(max_type)
+    if not isinstance(max_proxy, (int, torch.SymInt)):
+        raise exc.IncorrectTileUsage(
+            f"expected max to be an integer or size, got {max_proxy!s}"
+        )
+    if not isinstance(min_proxy, int):
+        raise exc.IncorrectTileUsage(
+            f"expected min to be an integer constant, got {min_proxy!s}"
+        )
+    env = CompileEnvironment.current()
+    result = TileIndexType.allocate(AutoSize(), origin)
+    loop_spec = env.config_spec.block_sizes.block_id_lookup(result.block_id)
+    loop_spec.min_size = assert_integer_power_of_two(max(1, min_proxy))
+    loop_spec.max_size = next_power_of_2(env.size_hint(max_proxy))
+    block_id = result.block_id
+    return SymIntType(origin, env.block_sizes[block_id].var)
+
+
+def _block_id_from_state(state: CodegenState) -> int:
+    """Extract the block_id from the current state for hl.register_block_size nodes."""
+    from .._compiler.type_propagation import SymIntType
+
+    env = CompileEnvironment.current()
+    if state.fx_node is not None:
+        val = state.fx_node.meta["val"]
+        assert isinstance(val, SymIntType)
+        block_id = env.get_block_id(val.value)
+        assert block_id is not None
+        return block_id
+    current_node = ExtendedAST.current()[-1]
+    type_info = current_node._type_info
+    assert isinstance(type_info, SymIntType)
+    block_id = env.get_block_id(type_info.value)
+    assert block_id is not None
+    return block_id
+
+
+@_decorators.codegen(register_block_size)
+def _(state: CodegenState) -> ast.AST:
+    env = CompileEnvironment.current()
+    block_size = env.config_spec.block_sizes.config_get(
+        state.config.block_sizes, _block_id_from_state(state)
+    )
+    assert block_size is not None
+    return expr_from_string(constant_repr(block_size))
+
+
+@_decorators.api(is_device_only=False, cache_type=True, tiles_as_sizes=True)
+def register_reduction_dim(
+    size: int,
+) -> int:
+    """
+    Explicitly register a reduction dimension that should be used for reduction operations.
+
+    This is useful when you need to allocate a dimension for reduction that isn't
+    automatically inferred from a slice operation. The registered dimension can be
+    used for allocations and operations that require knowing the reduction size upfront.
+
+    :param size: An integer representing the reduction dimension size.
+    :return: A SymInt object representing the reduction dimension size.
+    """
+    raise exc.NotInsideKernel
+
+
+@_decorators.type_propagation(register_reduction_dim)
+def _(sizes: TypeInfo, *, origin: Origin) -> TypeInfo:
+    from .._compiler.compile_environment import CompileEnvironment
+    from .._compiler.type_propagation import SymIntType
+
+    try:
+        proxy_sizes = sizes.proxy()
+        if not isinstance(proxy_sizes, int | torch.SymInt):
+            raise NotImplementedError
+    except NotImplementedError:
+        raise exc.TypePropagationError(
+            UnknownType(
+                origin,
+                f"register_reduction_dim() expected int or list[int], got {sizes!s}",
+                chained_from=sizes,
+            )
+        ) from None
+
+    env = CompileEnvironment.current()
+
+    rdim = env.allocate_reduction_dimension(proxy_sizes)
+    return SymIntType(origin, rdim.var)
+
+
+@_decorators.codegen(register_reduction_dim)
+def _(state: CodegenState) -> ast.AST:
+    """Generate code for register_reduction_dim - return the size expression"""
+    from .._compiler.type_propagation import SymIntType
+
+    current_node = ExtendedAST.current()[-1]
+    type_info = current_node._type_info
+
+    assert isinstance(type_info, SymIntType)
+    return current_node.args[0]  # pyre-ignore[16]
 
 
 @_decorators.api(is_device_only=False)
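
Read together, the two ops moved here enable the "allocate before you tile" pattern the docstrings describe. A hedged sketch of that pattern (the hl.zeros allocation helper and the accumulation idiom are assumptions for illustration; only register_block_size and register_reduction_dim are confirmed by this diff):

import torch
import helion
import helion.language as hl

@helion.kernel()
def row_sums(x: torch.Tensor) -> torch.Tensor:
    m, n = x.size()
    out = torch.empty([m], dtype=torch.float32, device=x.device)
    block_m = hl.register_block_size(m)  # autotuned row-tile size
    rdim = hl.register_reduction_dim(n)  # reduction extent known upfront
    for tile_m in hl.tile(m, block_size=block_m):
        # The registered sizes let this accumulator be shaped before any
        # slice operation would have inferred the reduction dimension.
        acc = hl.zeros([tile_m, rdim], dtype=torch.float32)  # assumed helper
        acc = acc + x[tile_m, :]
        out[tile_m] = acc.sum(-1)
    return out

Since register_reduction_dim(n) returns a SymInt tied to n, the accumulator's last dimension lines up with the loaded slice by construction.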
