sigridjineth
diff --git a/‎.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
Lines changed: 1 addition & 1 deletion b/‎.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
Lines changed: 34 additions & 1 deletion b/‎benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
Lines changed: 34 additions & 1 deletion
diff --git a/‎benchmarks/kernels/benchmark_moe.py
Lines changed: 5 additions & 0 deletions b/‎benchmarks/kernels/benchmark_moe.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎csrc/moe/moe_permute_unpermute_op.cu
Lines changed: 42 additions & 11 deletions b/‎csrc/moe/moe_permute_unpermute_op.cu
Lines changed: 42 additions & 11 deletions
diff --git a/‎csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
Lines changed: 37 additions & 12 deletions b/‎csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu
Lines changed: 37 additions & 12 deletions
@@ -70,7 +70,7 @@ export VLLM_XLA_CACHE_PATH=
 echo "Using VLLM V1"
 
 echo "--- Hardware Information ---"
-tpu-info
+# tpu-info
 echo "--- Starting Tests ---"
 set +e
 overall_script_exit_code=0
 
@@ -45,7 +45,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
 set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
 
 #
 
@@ -80,6 +80,11 @@ def bench_run(
         a, score, topk, renormalize=False
     )
 
+    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
     def run_triton_moe(
         a: torch.Tensor,
         w1: torch.Tensor,
@@ -111,6 +116,10 @@ def run_cutlass_moe(
         w2: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         per_act_token: bool,
@@ -125,6 +134,10 @@ def run_cutlass_moe(
                 topk_ids,
                 w1_scale,
                 w2_scale,
+                ab_strides1,
+                ab_strides2,
+                c_strides1,
+                c_strides2,
                 per_act_token,
                 a1_scale=None,
             )
@@ -136,6 +149,10 @@ def run_cutlass_from_graph(
         w2_q: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
@@ -150,6 +167,10 @@ def run_cutlass_from_graph(
                 topk_ids,
                 w1_scale,
                 w2_scale,
+                ab_strides1,
+                ab_strides2,
+                c_strides1,
+                c_strides2,
                 per_act_token,
                 a1_scale=None,
             )
@@ -194,6 +215,10 @@ def replay_graph(graph, num_repeats):
             w2_q,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             topk_weights,
             topk_ids,
         )
@@ -231,6 +256,10 @@ def replay_graph(graph, num_repeats):
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
         "per_act_token": per_act_token,
+        "ab_strides1": ab_strides1,
+        "ab_strides2": ab_strides2,
+        "c_strides1": c_strides1,
+        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,
@@ -289,6 +318,10 @@ def replay_graph(graph, num_repeats):
         w2_q,
         w1_scale,
         w2_scale,
+        ab_strides1,
+        ab_strides2,
+        c_strides1,
+        c_strides2,
         topk_weights,
         topk_ids,
         per_act_token,
@@ -297,7 +330,7 @@ def replay_graph(graph, num_repeats):
 
     results.append(
         benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
 
@@ -586,6 +586,11 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
+        E = config.num_experts
+        topk = config.moe_topk[0]
+        intermediate_size = config.moe_intermediate_size[0]
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
     else:
         # Support for llama4
         config = config.get_text_config()
 
@@ -160,6 +160,30 @@ __global__ void shuffleInputRowsKernel(const T* input,
   }
 }
 
+// Element-wise fallback for shuffling rows: output row `r` receives a copy of
+// input row `dst2src_map[r]`. Each block handles the destination row given by
+// blockIdx.x, and its threads walk the columns starting at threadIdx.x in
+// steps of blockDim.x. `num_src_rows` is not read by this kernel.
+template <typename T>
+__global__ void shuffleInputRowsKernelSlow(const T* input,
+                                           const int32_t* dst2src_map,
+                                           T* output, int64_t num_src_rows,
+                                           int64_t num_dst_rows,
+                                           int64_t num_cols) {
+  int64_t const dest_row_idx = blockIdx.x;
+
+  // Bounds-check before reading dst2src_map so that a block past the last
+  // destination row never indexes beyond the end of the map.
+  if (dest_row_idx >= num_dst_rows) {
+    return;
+  }
+
+  // Duplicate and permute rows
+  int64_t const source_row_idx = dst2src_map[dest_row_idx];
+  auto const* source_row_ptr = input + source_row_idx * num_cols;
+  auto* dest_row_ptr = output + dest_row_idx * num_cols;
+
+  int64_t const start_offset = threadIdx.x;
+  int64_t const stride = blockDim.x;
+
+  // 64-bit counter: num_cols, start_offset and stride are all int64_t, so an
+  // `int` index would narrow and could overflow for very wide rows.
+  for (int64_t elem_index = start_offset; elem_index < num_cols;
+       elem_index += stride) {
+    dest_row_ptr[elem_index] = source_row_ptr[elem_index];
+  }
+}
+
 void shuffle_rows(const torch::Tensor& input_tensor,
                   const torch::Tensor& dst2src_map,
                   torch::Tensor& output_tensor) {
@@ -173,17 +197,24 @@ void shuffle_rows(const torch::Tensor& input_tensor,
   int64_t const num_src_rows = input_tensor.size(0);
   int64_t const num_cols = input_tensor.size(1);
 
-  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
-              "num_cols must be divisible by 128 / "
-              "sizeof(input_tensor.scalar_type()) / 8");
-
-  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
-    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
-        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
-        dst2src_map.data_ptr<int32_t>(),
-        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
-        num_dest_rows, num_cols);
-  });
+  if (num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)) {
+    // use slow kernel if num_cols can't be aligned to 128 bits
+    MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+      shuffleInputRowsKernelSlow<scalar_t><<<blocks, threads, 0, stream>>>(
+          reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
+          dst2src_map.data_ptr<int32_t>(),
+          reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
+          num_dest_rows, num_cols);
+    });
+  } else {
+    MOE_DISPATCH(input_tensor.scalar_type(), [&] {
+      shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
+          reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
+          dst2src_map.data_ptr<int32_t>(),
+          reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
+          num_dest_rows, num_cols);
+    });
+  }
 }
 
 #else
 
@@ -29,19 +29,36 @@ struct sm90_fp8_config_default {
 
 template <typename InType, typename OutType,
           template <typename, typename, typename> typename Epilogue>
-struct sm90_fp8_config_M16 {
-  // M in [1, 16]
+struct sm90_fp8_config_M4 {
+  // M in [1, 4]
   static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
   using KernelSchedule =
       cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
   using EpilogueSchedule =
       cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
-  using TileShape = cute::Shape<cute::_64, cute::_64, cute::_128>;
-  using ClusterShape = cute::Shape<cute::_1, cute::_4, cute::_1>;
+  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_128>;
+  using ClusterShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
 
   using Cutlass3xGemm =
       cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
-                            KernelSchedule, EpilogueSchedule>;
+                            KernelSchedule, EpilogueSchedule, true>;
+};
+
+// Grouped-GEMM kernel configuration that run_cutlass_moe_mm_sm90 dispatches to
+// when 4 < m <= 64. Only cutlass::float_e4m3_t inputs are accepted (enforced
+// by the static_assert below).
+template <typename InType, typename OutType,
+          template <typename, typename, typename> typename Epilogue>
+struct sm90_fp8_config_M64 {
+  // M in (4, 64]
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule =
+      cutlass::epilogue::PtrArrayTmaWarpSpecializedPingpong;
+  using TileShape = cute::Shape<cute::_128, cute::_16, cute::_256>;
+  using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
+
+  // NOTE(review): the trailing `true` is presumably the swap_ab switch that
+  // the dispatch site refers to ("Use swap_ab for M <= 64") -- confirm against
+  // the cutlass_3x_group_gemm template parameter list.
+  using Cutlass3xGemm =
+      cutlass_3x_group_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
+                            KernelSchedule, EpilogueSchedule, true>;
 
 template <typename InType, typename OutType,
@@ -102,7 +119,9 @@ void run_cutlass_moe_mm_sm90(
       InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
   using Cutlass3xGemmK8192 = typename sm90_fp8_config_K8192<
       InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
-  using Cutlass3xGemmM16 = typename sm90_fp8_config_M16<
+  using Cutlass3xGemmM4 = typename sm90_fp8_config_M4<
+      InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 = typename sm90_fp8_config_M64<
       InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
   using Cutlass3xGemmDefault = typename sm90_fp8_config_default<
       InType, OutType, vllm::c3x::ScaledEpilogueArray>::Cutlass3xGemm;
@@ -111,18 +130,24 @@ void run_cutlass_moe_mm_sm90(
   uint32_t const n = out_tensors.size(1);
   uint32_t const k = a_tensors.size(1);
 
-  if (n >= 8192) {
-    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
+  // Use swap_ab for M <= 64 by default to reduce padding
+  if (m <= 4) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM4>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
         problem_sizes, a_strides, b_strides, c_strides, per_act_token,
         per_out_ch);
-  } else if (k >= 8192) {
-    cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
+  } else if (m <= 64) {
+    cutlass_group_gemm_caller<Cutlass3xGemmM64>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
         problem_sizes, a_strides, b_strides, c_strides, per_act_token,
         per_out_ch);
-  } else if (m <= 16) {
-    cutlass_group_gemm_caller<Cutlass3xGemmM16>(
+  } else if (n >= 8192) {
+    cutlass_group_gemm_caller<Cutlass3xGemmN8192>(
+        out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
+        problem_sizes, a_strides, b_strides, c_strides, per_act_token,
+        per_out_ch);
+  } else if (k >= 8192) {
+    cutlass_group_gemm_caller<Cutlass3xGemmK8192>(
         out_tensors, a_tensors, b_tensors, a_scales, b_scales, expert_offsets,
         problem_sizes, a_strides, b_strides, c_strides, per_act_token,
         per_out_ch);
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1`
`45`	`45`	`# requirements.txt files and should be kept consistent. The ROCm torch`
`46`	`46`	`# versions are derived from docker/Dockerfile.rocm`
`47`	`47`	`#`
`48`		`-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")`
	`48`	`+set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")`
`49`	`49`	`set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")`
`50`	`50`
`51`	`51`	`#`