
Commit 705bff0

q10 authored and facebook-github-bot committed
Enable HSTU builds in fbcode (#4290)
Summary:
Pull Request resolved: #4290
X-link: facebookresearch/FBGEMM#1366

- Enable HSTU builds in fbcode

Reviewed By: ionuthristodorescu

Differential Revision: D76093631

fbshipit-source-id: e60e23c7ed7df1916661a17b519ca85e12cfeeaa
1 parent ba16adc commit 705bff0

8 files changed: +162 -112 lines
fbgemm_gpu/experimental/hstu/hstu/__init__.py

Lines changed: 30 additions & 13 deletions
@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
-# Copyright (c) 2024, NVIDIA Corporation & AFFILIATES.
-# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# Copyright (c) 2024, NVIDIA Corporation & AFFILIATES.
+
 # pyre-strict
 
+import logging
 import os
 
 import torch
@@ -23,15 +25,30 @@
 except Exception:
     open_source: bool = False
 
-# pyre-ignore[16]
-if open_source:
-    torch.ops.load_library(
-        os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_hstu.so")
-    )
-    torch.classes.load_library(
-        os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_hstu.so")
-    )
+if (
+    torch.cuda.is_available()
+    and torch.version.cuda is not None
+    and torch.version.cuda > "12.4"
+):
+    if open_source:
+        torch.ops.load_library(
+            os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_hstu.so")
+        )
+        torch.classes.load_library(
+            os.path.join(os.path.dirname(__file__), "fbgemm_gpu_experimental_hstu.so")
+        )
+    else:
+        torch.ops.load_library(
+            "//deeplearning/fbgemm/fbgemm_gpu/experimental/hstu/src:hstu_ops_gpu_sm80"
+        )
+
+        if torch.cuda.get_device_capability() >= (9, 0):
+            torch.ops.load_library(
+                "//deeplearning/fbgemm/fbgemm_gpu/experimental/hstu/src:hstu_ops_gpu_sm90"
+            )
+
 else:
-    torch.ops.load_library(
-        "//deeplearning/fbgemm/fbgemm_gpu/experimental/hstu:hstu_ops"
-    )
+    logging.warning("CUDA is not available for FBGEMM HSTU")
+
+
+from .cuda_hstu_attention import hstu_attn_varlen_func, HstuAttnVarlenFunc  # noqa: F401
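
The module now registers the HSTU ops only when a CUDA toolkit newer than 12.4 is present, and pulls in the SM90 build only on devices reporting compute capability 9.0 or higher. Below is a minimal, self-contained sketch of the same gating pattern; the library file names are illustrative, not the actual build outputs. Note that torch.version.cuda is a string, so the diff's > "12.4" check is lexicographic; the sketch parses it numerically instead:

import logging
import os

import torch


def load_hstu_ops(lib_dir: str) -> None:
    # Sketch: register HSTU ops only where the kernels can actually run.
    if not torch.cuda.is_available() or torch.version.cuda is None:
        logging.warning("CUDA is not available for FBGEMM HSTU")
        return

    major, minor = (int(v) for v in torch.version.cuda.split(".")[:2])
    if (major, minor) <= (12, 4):  # the kernels require CUDA > 12.4
        logging.warning("FBGEMM HSTU requires a CUDA toolkit newer than 12.4")
        return

    # SM80 (Ampere) kernels always load; SM90 (Hopper) kernels load only
    # when the current device is compute capability 9.0 or newer.
    torch.ops.load_library(os.path.join(lib_dir, "fbgemm_gpu_experimental_hstu.so"))
    if torch.cuda.get_device_capability() >= (9, 0):
        # Hypothetical path for a split SM90 library; in the OSS build a
        # single .so carries both architectures.
        torch.ops.load_library(os.path.join(lib_dir, "fbgemm_gpu_experimental_hstu_sm90.so"))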

fbgemm_gpu/experimental/hstu/hstu/cuda_hstu_attention.py

Lines changed: 18 additions & 16 deletions
@@ -1,22 +1,23 @@
 #!/usr/bin/env python3
-# Copyright (c) 2024, NVIDIA Corporation & AFFILIATES.
-# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# Copyright (c) 2024, NVIDIA Corporation & AFFILIATES.
+
 # pyre-strict
 
-from typing import Tuple
+from typing import Any, Optional, Tuple
 
 import torch
 
 
 class HstuAttnVarlenFunc(torch.autograd.Function):
     @staticmethod
-    def forward(
-        ctx,
+    def forward(  # pyre-ignore[14]
+        ctx,  # pyre-ignore[2]
         q: torch.Tensor,  # need grad
         k: torch.Tensor,  # need grad
         v: torch.Tensor,  # need grad
@@ -29,12 +30,12 @@ def forward(
         target_group_size: int,
         window_size: Tuple[int, int] = (-1, -1),
         alpha: float = 1.0,
-        rab: torch.Tensor = None,  # need grad
+        rab: Optional[torch.Tensor] = None,  # need grad
         has_drab: bool = False,
         is_delta_q: bool = False,
-        descale_q: torch.Tensor = None,
-        descale_k: torch.Tensor = None,
-        descale_v: torch.Tensor = None,
+        descale_q: Optional[torch.Tensor] = None,
+        descale_k: Optional[torch.Tensor] = None,
+        descale_v: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         assert q.dim() == 3, "q shape should be (L, num_heads, head_dim)"
         assert k.dim() == 3, "k shape should be (L, num_heads, head_dim)"
@@ -104,10 +105,10 @@ def forward(
         return out
 
     @staticmethod
-    def backward(
-        ctx,
+    def backward(  # pyre-ignore[14]
+        ctx,  # pyre-ignore[2]
         dout: torch.Tensor,
-        *args: any,
+        *args: Any,
     ) -> tuple[
         torch.Tensor,
         torch.Tensor,
@@ -214,6 +215,7 @@ def backward(
         )
 
 
+# pyre-ignore[3]
 def hstu_attn_varlen_func(
     q: torch.Tensor,
     k: torch.Tensor,
@@ -227,12 +229,12 @@ def hstu_attn_varlen_func(
     target_group_size: int = 1,
     window_size: Tuple[int, int] = (-1, -1),
     alpha: float = 1.0,
-    rab: torch.Tensor = None,
+    rab: Optional[torch.Tensor] = None,
     has_drab: bool = False,
     is_delta_q: bool = False,
-    descale_q: torch.Tensor = None,
-    descale_k: torch.Tensor = None,
-    descale_v: torch.Tensor = None,
+    descale_q: Optional[torch.Tensor] = None,
+    descale_k: Optional[torch.Tensor] = None,
+    descale_v: Optional[torch.Tensor] = None,
 ):
     """
     Arguments:
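
The typing changes in this file are what pyre-strict demands: a parameter whose default is None must be annotated Optional, since pyre rejects implicit-Optional defaults like the previous rab: torch.Tensor = None. A minimal illustration of the corrected pattern (a toy function, not the HSTU signature):

from typing import Optional

import torch


def add_bias(scores: torch.Tensor, rab: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Under pyre-strict, `rab: torch.Tensor = None` is a type error;
    # the Optional annotation makes the None default explicit, and the
    # None case must then be handled before use.
    if rab is not None:
        scores = scores + rab
    return scores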

fbgemm_gpu/experimental/hstu/src/generate_kernels.py

Lines changed: 38 additions & 19 deletions
@@ -1,15 +1,21 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
-# Copyright (c) Meta Platforms, Inc. and affiliates.
+#!/usr/bin/env python3
+# Portions Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+
 import itertools
 import os
 
 
-def generate_kernels_ampere():
+def generate_kernels_ampere(install_dir: str):
+    """
+    Generate HSTU kernels for Ampere architecture.
+    """
+
     DTYPE_16 = ["bf16", "fp16"]
     HEAD_DIMENSIONS = [32, 64, 128, 256]
     RAB = ["", "_rab"]
@@ -30,6 +36,8 @@ def generate_kernels_ampere():
         "fp16": "cutlass::half_t",
     }
 
+    os.makedirs(install_dir, exist_ok=True)
+
     ampere_fwd_file_head = """
 /*
  * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
@@ -51,9 +59,7 @@ def generate_kernels_ampere():
     for hdim, dtype, rab, mask in itertools.product(
         HEAD_DIMENSIONS, DTYPE_16, RAB, MASK
     ):
-        file_name = (
-            f"hstu_ampere/instantiations/hstu_fwd_hdim{hdim}_{dtype}{rab}{mask}_sm80.cu"
-        )
+        file_name = f"{install_dir}/hstu_fwd_hdim{hdim}_{dtype}{rab}{mask}_sm80.cu"
         if not os.path.exists(file_name):
             with open(file_name, "w") as f:
                 f.write(
@@ -90,7 +96,7 @@ def generate_kernels_ampere():
     for hdim, dtype, rab_drab, mask in itertools.product(
         HEAD_DIMENSIONS, DTYPE_16, RAB_DRAB, MASK
     ):
-        file_name = f"hstu_ampere/instantiations/hstu_bwd_hdim{hdim}_{dtype}{rab_drab}{mask}_sm80.cu"
+        file_name = f"{install_dir}/hstu_bwd_hdim{hdim}_{dtype}{rab_drab}{mask}_sm80.cu"
         if not os.path.exists(file_name):
             with open(file_name, "w") as f:
                 f.write(
@@ -108,7 +114,11 @@ def generate_kernels_ampere():
                 )
 
 
-def generate_kernels_hopper():
+def generate_kernels_hopper(install_dir: str):
+    """
+    Generate HSTU kernels for Hopper architecture.
+    """
+
     DTYPE_16 = ["bf16", "fp16"]
     HEAD_DIMENSIONS = [32, 64, 128, 256]
     RAB = ["", "_rab"]
@@ -130,6 +140,8 @@ def generate_kernels_hopper():
         "fp16": "cutlass::half_t",
     }
 
+    os.makedirs(install_dir, exist_ok=True)
+
     hopper_fwd_file_head = """
 /*
  * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
@@ -151,9 +163,7 @@ def generate_kernels_hopper():
     for hdim, dtype, rab, mask in itertools.product(
         HEAD_DIMENSIONS, DTYPE_16, RAB, MASK
     ):
-        file_name = (
-            f"hstu_hopper/instantiations/hstu_fwd_hdim{hdim}_{dtype}{rab}{mask}_sm90.cu"
-        )
+        file_name = f"{install_dir}/hstu_fwd_hdim{hdim}_{dtype}{rab}{mask}_sm90.cu"
         if not os.path.exists(file_name):
             with open(file_name, "w") as f:
                 f.write(
@@ -172,9 +182,7 @@ def generate_kernels_hopper():
     for hdim, rab, mask in itertools.product(HEAD_DIMENSIONS, RAB, FP8_MASK):
         if hdim == 32:
             continue
-        file_name = (
-            f"hstu_hopper/instantiations/hstu_fwd_hdim{hdim}_e4m3{rab}{mask}_sm90.cu"
-        )
+        file_name = f"{install_dir}/hstu_fwd_hdim{hdim}_e4m3{rab}{mask}_sm90.cu"
         if not os.path.exists(file_name):
             with open(file_name, "w") as f:
                 f.write(
@@ -211,7 +219,7 @@ def generate_kernels_hopper():
     for hdim, dtype, rab_drab, mask in itertools.product(
         HEAD_DIMENSIONS, DTYPE_16, RAB_DRAB, MASK
    ):
-        file_name = f"hstu_hopper/instantiations/hstu_bwd_hdim{hdim}_{dtype}{rab_drab}{mask}_sm90.cu"
+        file_name = f"{install_dir}/hstu_bwd_hdim{hdim}_{dtype}{rab_drab}{mask}_sm90.cu"
         if not os.path.exists(file_name):
             with open(file_name, "w") as f:
                 f.write(
@@ -229,8 +237,7 @@ def generate_kernels_hopper():
                 )
 
 
-if __name__ == "__main__":
-
+def main() -> None:
     import argparse
 
     parser = argparse.ArgumentParser()
@@ -240,10 +247,22 @@ def generate_kernels_hopper():
         default="8.0 9.0",
         help="Comma-separated list of CUDA architectures to generate kernels for",
     )
+    parser.add_argument(
+        "--install_dir",
+        type=str,
+        default=None,
+        help="Output directory for generated source files",
+    )
     args = parser.parse_args()
 
     if "8.0" in args.arch_list:
-        generate_kernels_ampere()
+        # In OSS, the generated files will be written to hstu_ampere/instantiations
+        generate_kernels_ampere(args.install_dir or "hstu_ampere/instantiations")
 
     if "9.0" in args.arch_list:
-        generate_kernels_hopper()
+        # In OSS, the generated files will be written to hstu_hopper/instantiations
+        generate_kernels_hopper(args.install_dir or "hstu_hopper/instantiations")
+
+
+if __name__ == "__main__":
+    main()
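
Factoring the entry point into main() and threading install_dir through both generators makes the script usable from a build rule as well as by hand. A usage sketch, with arbitrary example output paths:

# From the command line (OSS defaults apply when --install_dir is omitted):
#   python generate_kernels.py --arch_list "8.0 9.0"
#       -> hstu_ampere/instantiations and hstu_hopper/instantiations
#   python generate_kernels.py --arch_list "9.0" --install_dir build/gen
#       -> build/gen

# Or programmatically, e.g. from a build script:
from generate_kernels import generate_kernels_ampere, generate_kernels_hopper

generate_kernels_ampere("build/gen/sm80")  # creates the directory if needed
generate_kernels_hopper("build/gen/sm90")  # files that already exist are left untouched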

fbgemm_gpu/experimental/hstu/src/hstu_ampere/hstu_ops_gpu.cpp

Lines changed: 16 additions & 14 deletions
@@ -1,26 +1,28 @@
 /*
- * Copyright (c) 2023, Tri Dao.
- * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * Portions Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
 
+/*
+ * Copyright (c) 2023, Tri Dao.
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ */
+
 // Include these 2 headers instead of torch/extension.h since we don't need all
 // of the torch headers.
 #include <ATen/ATen.h>
-#include <torch/library.h>
-
-#include "c10/core/ScalarType.h"
-
 #include <ATen/core/op_registration/op_registration.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/core/ScalarType.h>
 #include <c10/cuda/CUDAGuard.h>
-#include <c10/util/Optional.h>
+#include <torch/library.h>
 #include <torch/nn/functional.h>
 
+#include <optional>
+
 #include "hstu.h"
 #include "static_switch.h"
 
@@ -440,13 +442,13 @@ std::tuple<at::Tensor, at::Tensor> hstu_varlen_fwd_80(
     const at::Tensor& cu_seqlens_k, // b+1
     const int64_t max_seqlen_q,
     const int64_t max_seqlen_k,
-    const c10::optional<at::Tensor>& num_contexts, // b
-    const c10::optional<at::Tensor>& num_targets, // b
+    const std::optional<at::Tensor>& num_contexts, // b
+    const std::optional<at::Tensor>& num_targets, // b
     const int64_t target_group_size,
     int64_t window_size_left,
     int64_t window_size_right,
     const double alpha,
-    c10::optional<at::Tensor> rab,
+    std::optional<at::Tensor> rab,
     const bool is_delta_q) {
   auto dprops = at::cuda::getCurrentDeviceProperties();
   TORCH_CHECK(dprops->major >= 8, "HSTU only supports Ampere GPUs or newer.");
@@ -738,13 +740,13 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor> hstu_varlen_bwd_80(
     const at::Tensor& cu_seqlens_k, // b+1
     const int64_t max_seqlen_q,
     const int64_t max_seqlen_k,
-    const c10::optional<at::Tensor>& num_contexts, // b
-    const c10::optional<at::Tensor>& num_targets, // b
+    const std::optional<at::Tensor>& num_contexts, // b
+    const std::optional<at::Tensor>& num_targets, // b
     const int64_t target_group_size,
     int64_t window_size_left,
     int64_t window_size_right,
     const double alpha,
-    const c10::optional<at::Tensor>& rab,
+    const std::optional<at::Tensor>& rab,
     const bool has_drab,
     const bool is_delta_q,
     const bool deterministic) {

fbgemm_gpu/experimental/hstu/src/hstu_hopper/hstu_bwd_launch_template.h

Lines changed: 8 additions & 5 deletions
@@ -1,14 +1,17 @@
 /*
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- * Pradeep Ramani, Tri Dao.
- * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
- * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * Portions Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
 * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
 
+/*
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ * Pradeep Ramani, Tri Dao.
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ */
+
 #pragma once
 
 #include "cute/tensor.hpp"
@@ -126,7 +129,7 @@ void run_hstu_bwd(Hstu_bwd_params& params, cudaStream_t stream) {
         params.window_size_left,
         params.window_size_right,
         params.target_group_size,
-        1.0 / params.target_group_size,
+        1.0f / params.target_group_size,
         params.alpha,
         params.dq_semaphore});
