
Commit 16e2d6f: Add hl.associative_scan

stack-info: PR: #239, branch: jansel/stack/78
1 parent: a6d5031

12 files changed: +1248 -15 lines

helion/_compiler/compile_environment.py

Lines changed: 7 additions & 2 deletions
@@ -222,10 +222,15 @@ def to_fake(self, obj: object, origin: Origin) -> object:
             ),
         ):
             return obj
-        if isinstance(obj, types.FunctionType):
+        # Handle functions and Kernel objects
+        from ..runtime.kernel import Kernel
+
+        if isinstance(obj, (types.FunctionType, Kernel)):
+            from .helper_function import extract_helper_function
             from .lift_closures import lift_closures
 
-            return lift_closures(obj, origin)
+            fn = extract_helper_function(obj)
+            return lift_closures(fn, origin)
         if isinstance(obj, ConstExpr):
             return obj.value
         if isinstance(obj, list):
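With this change, `to_fake` accepts a combine function either as a plain Python function or wrapped in a `Kernel` object. `helion/_compiler/helper_function.py` is one of the files added by this commit whose diff is not shown on this page; a minimal sketch of what `extract_helper_function` presumably does, assuming `Kernel` stores the function it wraps as `.fn`:

```python
# Hypothetical sketch: helper_function.py is added by this commit but its
# diff is not shown on this page. Assumes Kernel exposes the wrapped
# function as `.fn`; the real implementation may differ.
from typing import Any, Callable


def extract_helper_function(fn: Any) -> Callable[..., Any]:
    """Unwrap a Kernel object to the plain function it wraps."""
    from helion.runtime.kernel import Kernel  # import path matches the diff above

    return fn.fn if isinstance(fn, Kernel) else fn
```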

helion/_compiler/device_function.py

Lines changed: 110 additions & 0 deletions
@@ -37,6 +37,7 @@
 
 if TYPE_CHECKING:
     from ..runtime.config import Config
+    from .device_ir import HelperFunctionGraphInfo
     from .generate_ast import GenerateAST
     from .program_id import ProgramIDs
 
@@ -185,6 +186,8 @@ def __init__(self, name: str, config: Config, codegen: GenerateAST) -> None:
         self.block_size_var_cache: dict[tuple[int, ...], str] = {}
         self.expr_to_var_info: dict[sympy.Expr, VarInfo] = {}
 
+        self.helper_functions: dict[str, HelperFunctionGraphInfo] = {}
+
         from .indexing_strategy import IndexingStrategy
         from .tile_dispatch import TileStrategyDispatch
 
@@ -488,6 +491,113 @@ def dead_code_elimination(self) -> None:
                 if v.name in args_to_remove:
                     del cache[k]
 
+    def register_helper_function(
+        self, helper_graph_info: HelperFunctionGraphInfo
+    ) -> None:
+        """Register a helper function to be generated at global scope."""
+        self.helper_functions[helper_graph_info.name] = helper_graph_info
+
+    def codegen_helper_functions(self) -> list[ast.stmt]:
+        """Generate helper function definitions at global scope."""
+        helper_defs = []
+        for helper_graph_info in self.helper_functions.values():
+            # Determine the number of parameters from the graph
+            input_nodes = helper_graph_info.find_input_nodes()
+
+            # Generate argument list with consistent names
+            args = []
+            param_names = []
+            for i in range(len(input_nodes)):
+                arg_name = f"param_{i}"
+                args.append(create_arg(arg_name))
+                param_names.append(arg_name)
+
+            # Store parameter names for use in body generation
+            helper_graph_info._param_names = param_names
+
+            # Process the FX graph to generate the correct helper function body
+            func_body = self._codegen_helper_function_body(helper_graph_info)
+
+            # Generate the function structure with @triton.jit decorator
+            func_def = create(
+                ast.FunctionDef,
+                name=helper_graph_info.name,
+                args=create_arguments(args),
+                body=func_body,
+                decorator_list=[expr_from_string("triton.jit")],
+                type_params=[],
+            )
+
+            helper_defs.append(func_def)
+
+        return helper_defs
+
+    def _codegen_helper_function_body(
+        self, helper_graph_info: HelperFunctionGraphInfo
+    ) -> list[ast.stmt]:
+        """Generate the body of a helper function by processing its FX graph."""
+        temp_device_function = self._create_temp_device_function(helper_graph_info)
+        param_args = self._create_parameter_args(helper_graph_info)
+
+        with temp_device_function:
+            results = self._process_helper_graph(
+                helper_graph_info, temp_device_function, param_args
+            )
+            statements = temp_device_function.body.copy()
+            self._ensure_return_statement(statements, results, helper_graph_info.name)
+
+        return cast("list[ast.stmt]", statements)
+
+    def _create_temp_device_function(
+        self, helper_graph_info: HelperFunctionGraphInfo
+    ) -> DeviceFunction:
+        """Create a temporary DeviceFunction for helper function generation."""
+        return DeviceFunction(
+            name=f"temp_{helper_graph_info.name}",
+            config=self.config,
+            codegen=self.codegen,
+        )
+
+    def _create_parameter_args(
+        self, helper_graph_info: HelperFunctionGraphInfo
+    ) -> list[ast.AST]:
+        """Create parameter AST nodes for the helper function."""
+        param_names = helper_graph_info._param_names
+        return [expr_from_string(param_name) for param_name in param_names]
+
+    def _process_helper_graph(
+        self,
+        helper_graph_info: HelperFunctionGraphInfo,
+        temp_device_function: DeviceFunction,
+        param_args: list[ast.AST],
+    ) -> object:
+        """Process the graph using the existing interpreter infrastructure."""
+        from .helper_function import HelperCodegen
+        from .inductor_lowering import GraphInterpreter
+
+        helper_codegen = HelperCodegen(temp_device_function)
+        interpreter = GraphInterpreter(helper_graph_info.graph, helper_codegen)
+        return interpreter.run(*param_args)
+
+    def _ensure_return_statement(
+        self, statements: list[ast.AST], results: object, function_name: str
+    ) -> None:
+        """Ensure the function body has a proper return statement."""
+        if statements and isinstance(statements[-1], ast.Return):
+            return
+
+        if isinstance(results, ast.AST):
+            statements.append(create(ast.Return, value=results))
+        elif isinstance(results, (list, tuple)) and all(
+            isinstance(r, ast.AST) for r in results
+        ):
+            tuple_ast = create(ast.Tuple, elts=list(results), ctx=ast.Load())
+            statements.append(create(ast.Return, value=tuple_ast))
+        else:
+            raise RuntimeError(
+                f"Helper function {function_name} produced invalid result: {type(results)} {results}"
+            )
+
     def __enter__(self) -> None:
         try:
            tls.functions.append(self)
helion/_compiler/device_ir.py

Lines changed: 187 additions & 2 deletions
@@ -2,7 +2,6 @@
 
 import ast
 import builtins
-from collections.abc import Callable
 import contextlib
 import dataclasses
 import functools
@@ -339,7 +338,11 @@ def build_rolled_reductions(self) -> None:
             for graph_id, graph_info in enumerate([*self.graphs]):
                 assert graph_id == graph_info.graph_id
                 roller = ReductionRoller(self, rdim, graph_to_info)
-                new_graph = roller.process(graph_info.graph)
+                try:
+                    new_graph = roller.process(graph_info.graph)
+                except NotImplementedError:
+                    first = False
+                    break
                 new_graph_id = self.add_graph(
                     new_graph, type(graph_info), **graph_info.kwargs()
                 )
@@ -807,9 +810,160 @@ def visit_Call(self, node: ast.Call) -> object:
         else:
             func = self.visit(node.func)
 
+        # Special handling for associative_scan
+        if isinstance(
+            (func_type_info := node.func._type_info),
+            CallableType,
+        ) and (
+            func_type_info.value is hl.associative_scan or func is hl.associative_scan
+        ):
+            return self._handle_associative_scan(node, args, kwargs)
+
         # pyre-ignore[6]
         return _CheckForIndexCalls.retry_call(func, args, kwargs)
 
+    def _handle_associative_scan(
+        self, node: ast.Call, args: list[object], kwargs: dict[str, object]
+    ) -> object:
+        """Handle associative_scan calls by tracing the combine function as a subgraph."""
+        from ..language import _tracing_ops
+
+        combine_fn: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = cast(
+            "Callable[[torch.Tensor, torch.Tensor], torch.Tensor]", args[0]
+        )  # The combine function
+        input_tensor = args[1]  # The input tensor
+
+        # Extract other arguments from kwargs
+        dim = kwargs.get("dim", 0)
+        reverse = kwargs.get("reverse", False)
+
+        # Detect if we're dealing with tuple inputs
+        is_tuple_input = isinstance(input_tensor, (tuple, list))
+
+        # Create a subgraph for the combine function
+        if is_tuple_input:
+
+            def run_combine_subgraph(
+                *args: torch.Tensor,
+            ) -> tuple[torch.Tensor, ...]:
+                # This will trace the combine function with unpacked tuple inputs
+                from .helper_function import extract_helper_function
+
+                # For tuple inputs, the combine function expects unpacked arguments
+                # args = [left_val1, left_val2, ..., right_val1, right_val2, ...]
+                # We need to call: combine_fn(left_val1, left_val2, ..., right_val1, right_val2, ...)
+                actual_fn = extract_helper_function(combine_fn)
+                result = actual_fn(*args)
+                return result if isinstance(result, tuple) else (result,)
+        else:
+
+            def run_combine_subgraph(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+                # This will trace the combine function with tensor inputs
+                from .helper_function import extract_helper_function
+
+                actual_fn = extract_helper_function(combine_fn)
+                return actual_fn(a, b)
+
+        # Create fake inputs for the combine function
+        if is_tuple_input:
+            # Handle tuple inputs
+            if isinstance(input_tensor, (tuple, list)) and all(
+                isinstance(t, torch.Tensor) for t in input_tensor
+            ):
+                fake_inputs = []
+                for tensor in input_tensor:
+                    fake_inputs.extend(
+                        [
+                            torch.empty([1], dtype=tensor.dtype, device=tensor.device),
+                            torch.empty([1], dtype=tensor.dtype, device=tensor.device),
+                        ]
+                    )
+                fake_a_and_b = fake_inputs
+            else:
+                # Fallback for when input_tensor is a proxy tuple
+                # Assume 2 tensors of float32 for now
+                fake_a_and_b = [torch.empty([1], dtype=torch.float32) for _ in range(4)]
+        else:
+            # Handle single tensor inputs
+            if isinstance(input_tensor, torch.Tensor):
+                fake_a = torch.empty(
+                    [1], dtype=input_tensor.dtype, device=input_tensor.device
+                )
+                fake_b = torch.empty(
+                    [1], dtype=input_tensor.dtype, device=input_tensor.device
+                )
+                fake_a_and_b = [fake_a, fake_b]
+            else:
+                # Fallback for when input_tensor is a proxy
+                fake_a = torch.empty([1], dtype=torch.float32)
+                fake_b = torch.empty([1], dtype=torch.float32)
+                fake_a_and_b = [fake_a, fake_b]
+
+        with self.disable_tracing() as tracer:
+            combine_graph = proxy_tensor.make_fx(
+                run_combine_subgraph, decomposition_table=select_decomp_table()
+            )(*fake_a_and_b).graph
+
+        combine_graph_id = self.device_ir.add_graph(
+            combine_graph,
+            HelperFunctionGraphInfo,
+            node_args=[],  # The combine function doesn't use external args
+        )
+
+        # Create the associative_scan tracing operation
+        scan_args = (
+            combine_graph_id,
+            input_tensor,
+            dim,
+            reverse,
+            is_tuple_input,
+        )
+
+        proxy_args, proxy_kwargs = args_to_proxies(tracer, scan_args)
+        proxy_out = tracer.create_proxy(
+            "call_function",
+            _tracing_ops._associative_scan,
+            proxy_args,
+            proxy_kwargs,
+        )
+
+        # The output has the same shape as the input
+        if is_tuple_input:
+            # For tuple inputs, track each element separately and return a tuple
+            proxy_tensor.track_tensor_tree(
+                input_tensor,
+                proxy_out,
+                constant=None,
+                tracer=tracer,
+            )
+            # Convert the proxy output to a tuple of individual proxies
+            tuple_proxies = []
+            assert isinstance(
+                input_tensor, (tuple, list)
+            )  # Guaranteed when is_tuple_input is True
+            for i, tensor in enumerate(input_tensor):
+                element_proxy = tracer.create_proxy(
+                    "call_function",
+                    operator.getitem,
+                    (proxy_out, i),
+                    {},
+                )
+                proxy_tensor.track_tensor_tree(
+                    tensor,
+                    element_proxy,
+                    constant=None,
+                    tracer=tracer,
+                )
+                tuple_proxies.append(tensor)
+            return tuple(tuple_proxies)
+        proxy_tensor.track_tensor_tree(
+            input_tensor,
+            proxy_out,
+            constant=None,
+            tracer=tracer,
+        )
+        return proxy_out
+
     def visit_Attribute(self, node: ast.Attribute) -> object:
         return getattr(self.visit(node.value), node.attr)
 
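`_handle_associative_scan` traces the combine function into its own FX graph by calling it under `make_fx` with `[1]`-shaped fake tensors, then records a single `_tracing_ops._associative_scan` node that references the subgraph by id. A standalone illustration of that tracing step (plain PyTorch, independent of Helion):

```python
# Standalone illustration of the subgraph tracing step above (not Helion code).
import torch
from torch.fx.experimental import proxy_tensor


def add_combine(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    return a + b


# [1]-shaped fake inputs stand in for the scan's element type, mirroring
# the fake_a/fake_b construction in _handle_associative_scan.
fake_a = torch.empty([1], dtype=torch.float32)
fake_b = torch.empty([1], dtype=torch.float32)
gm = proxy_tensor.make_fx(add_combine)(fake_a, fake_b)
print(gm.graph)  # placeholders a, b -> aten.add -> output
```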
@@ -898,6 +1052,37 @@ def lower_to_device_ir(func: HostFunction) -> DeviceIR:
     return device_ir
 
 
+@dataclasses.dataclass
+class HelperFunctionGraphInfo(NodeArgsGraphInfo):
+    """Graph info for helper functions in higher-order operations like associative_scan."""
+
+    _param_names: list[str] = dataclasses.field(default_factory=list)
+
+    @property
+    def name(self) -> str:
+        return f"helper_function_{self.graph_id}"
+
+    def find_input_nodes(self) -> list[torch.fx.Node]:
+        """Find all placeholder nodes (inputs) in the graph."""
+        return self.graph.find_nodes(op="placeholder")
+
+    def codegen(self, state: CodegenState) -> list[object]:
+        # For helper functions, we need to inline the function body
+        # The helper function takes variable arguments and returns their combination
+
+        # Generate temporary variable names for the helper function arguments
+        # Use the graph's input nodes to determine the number of parameters
+        input_nodes = self.find_input_nodes()
+        args: list[ast.AST] = []
+
+        for i in range(len(input_nodes)):
+            var_name = state.codegen.tmpvar(prefix=f"helper_arg_{i}")
+            args.append(create(ast.Name, id=var_name, ctx=ast.Load()))
+
+        # Generate the helper function call
+        return codegen_call_with_graph(state.codegen, self.graph, args)
+
+
 def remove_unnecessary_tile_index(graph: torch.fx.Graph) -> None:
     """
     Remove unnecessary tile_index nodes from the graph.

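For context, a hypothetical end-to-end usage sketch of the new API. The argument order (`combine_fn`, `input`, `dim`, `reverse`) follows `_handle_associative_scan` above; the kernel scaffolding (`helion.kernel`, `hl.tile`) follows Helion's usual conventions, and the exact signature may differ from the examples and tests elsewhere in this commit:

```python
# Hypothetical usage sketch; hl.associative_scan's argument order is taken
# from _handle_associative_scan above, the rest is assumed Helion boilerplate.
import torch

import helion
import helion.language as hl


def add_combine(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    return a + b


@helion.kernel()
def cumsum(x: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    for tile in hl.tile(x.size(0)):
        # Inclusive scan of each row along the last dimension.
        out[tile, :] = hl.associative_scan(add_combine, x[tile, :], dim=1)
    return out
```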