
Commit 1bcf7d3

Authored by q10, committed by facebook-github-bot
Migrate TBE inference kernels to FBGEMM_LAUNCH_KERNEL (#4092)
Summary:
X-link: facebookresearch/FBGEMM#1176

- Migrate TBE inference kernels to `FBGEMM_LAUNCH_KERNEL`

Reviewed By: spcyppt

Differential Revision: D73731461
1 parent b35c0f8 commit 1bcf7d3
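For context, the diff below swaps raw triple-chevron launches (each followed by a hand-written C10_CUDA_KERNEL_LAUNCH_CHECK()) for a single launcher macro that takes the grid, block, shared-memory size, stream, and kernel arguments in one call. The sketch below is not FBGEMM code; LAUNCH_KERNEL and scale_kernel are hypothetical stand-ins that illustrate the same pattern, assuming the launcher's main job is to centralize the launch configuration and the post-launch error check.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Hypothetical stand-in for a launcher macro: it forwards the launch
// configuration and arguments to the kernel, then checks the launch result
// so callers no longer write the check by hand after every launch site.
#define LAUNCH_KERNEL(kernel, grid, block, smem, stream, ...)            \
  do {                                                                   \
    kernel<<<(grid), (block), (smem), (stream)>>>(__VA_ARGS__);          \
    const cudaError_t err_ = cudaGetLastError();                         \
    if (err_ != cudaSuccess) {                                           \
      std::fprintf(                                                      \
          stderr, "%s failed: %s\n", #kernel, cudaGetErrorString(err_)); \
      std::abort();                                                      \
    }                                                                    \
  } while (0)

// Toy kernel standing in for the TBE lookup kernels touched in this diff.
__global__ void scale_kernel(float* x, float a, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    x[i] *= a;
  }
}

int main() {
  constexpr int n = 1024;
  float* x = nullptr;
  cudaMalloc(&x, n * sizeof(float));

  // Old style (what the diff removes): a raw launch plus a separate,
  // easy-to-forget error check after the call site.
  scale_kernel<<<(n + 255) / 256, 256, 0, 0>>>(x, 2.0f, n);

  // New style (what the diff moves to): configuration, stream, arguments,
  // and the error check all go through one macro invocation.
  LAUNCH_KERNEL(scale_kernel, (n + 255) / 256, 256, 0, 0, x, 2.0f, n);

  cudaFree(x);
  return 0;
}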

2 files changed: 38 additions & 50 deletions


fbgemm_gpu/codegen/inference/embedding_forward_quantized_split_lookup.cu

Lines changed: 21 additions & 29 deletions
@@ -7,6 +7,7 @@
  */
 
 #include "fbgemm_gpu/embedding_forward_template_helpers.cuh"
+#include "fbgemm_gpu/utils/kernel_launcher.cuh"
 #include "fbgemm_gpu/utils/tensor_accessor_builder.h"
 
 using namespace fbgemm_gpu;
@@ -170,28 +171,24 @@ Tensor pruned_hashmap_lookup_cuda(
 
   AT_DISPATCH_INDEX_TYPES(
       indices.scalar_type(), "pruned_hashmap_lookup_cuda_1", [&] {
-#ifdef FBGEMM_GPU_MEMCHECK
-        const auto func_name =
-            "int_nbit_split_embedding_codegen_forward_pruned_hashmap_lookup_kernel";
-#endif
-
-        int_nbit_split_embedding_codegen_forward_pruned_hashmap_lookup_kernel<<<
+        FBGEMM_LAUNCH_KERNEL(
+            (int_nbit_split_embedding_codegen_forward_pruned_hashmap_lookup_kernel<
+                index_t,
+                hash_t>),
             nbit::div_round_up(B * T + 1, kForwardMaxThreads / kWarpSize),
             dim3(kWarpSize, kForwardMaxThreads / kWarpSize),
             0,
-            at::cuda::getCurrentCUDAStream()>>>(
-            MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name, offsets, index_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name, hash_table, hash_t, 2, 64),
-            MAKE_PTA_WITH_NAME(
-                func_name, hash_table_offsets, int64_t, 1, 32),
+            at::cuda::getCurrentCUDAStream(),
+            PTA_B(indices, index_t, 1, 32),
+            PTA_B(offsets, index_t, 1, 32),
+            PTA_B(hash_table, hash_t, 2, 64),
+            PTA_B(hash_table_offsets, int64_t, 1, 32),
             B,
             T,
-            MAKE_PTA_WITH_NAME(func_name, dense_indices, index_t, 1, 32));
+            PTA_B(dense_indices, index_t, 1, 32));
       });
     });
 
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   return dense_indices;
 }
 
@@ -235,29 +232,24 @@ Tensor pruned_array_lookup_cuda(
 
   AT_DISPATCH_INDEX_TYPES(
       indices.scalar_type(), "pruned_array_lookup_cuda_1", [&] {
-#ifdef FBGEMM_GPU_MEMCHECK
-        const auto func_name =
-            "int_nbit_split_embedding_codegen_forward_pruned_array_lookup_kernel";
-#endif
-
-        int_nbit_split_embedding_codegen_forward_pruned_array_lookup_kernel<<<
+        FBGEMM_LAUNCH_KERNEL(
+            (int_nbit_split_embedding_codegen_forward_pruned_array_lookup_kernel<
+                index_t,
+                remap_t>),
             nbit::div_round_up(
                 offsets.size(0), kForwardMaxThreads / kWarpSize),
             dim3(kWarpSize, kForwardMaxThreads / kWarpSize),
             0,
-            at::cuda::getCurrentCUDAStream()>>>(
-            MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32),
-            MAKE_PTA_WITH_NAME(func_name, offsets, index_t, 1, 32),
-            MAKE_PTA_WITH_NAME(
-                func_name, index_remappings, remap_t, 1, 64),
-            MAKE_PTA_WITH_NAME(
-                func_name, index_remappings_offsets, int64_t, 1, 32),
+            at::cuda::getCurrentCUDAStream(),
+            PTA_B(indices, index_t, 1, 32),
+            PTA_B(offsets, index_t, 1, 32),
+            PTA_B(index_remappings, remap_t, 1, 64),
+            PTA_B(index_remappings_offsets, int64_t, 1, 32),
            B,
            T,
-            MAKE_PTA_WITH_NAME(func_name, dense_indices, index_t, 1, 32));
+            PTA_B(dense_indices, index_t, 1, 32));
       });
     });
 
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
   return dense_indices;
 }

fbgemm_gpu/codegen/inference/embedding_forward_quantized_split_nbit_host_template.cu

Lines changed: 17 additions & 21 deletions
@@ -9,6 +9,7 @@
 // clang-format off
 {%- set wdesc = "weighted" if weighted else "unweighted" %}
 #include "fbgemm_gpu/embedding_forward_template_helpers.cuh"
+#include "fbgemm_gpu/utils/kernel_launcher.cuh"
 #include "fbgemm_gpu/utils/tensor_accessor_builder.h"
 #include "fbgemm_gpu/config/feature_gates.h"
 
@@ -63,51 +64,46 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no
 {%- macro define_kernel_invocation(emb_weight_type) %}
 {%- set func_name = "nbit::" + emb_weight_type + "_split_embedding" + ("_nobag" if nobag else "") + "_codegen_forward_" + wdesc + "_kernel_small_L" %}
 
-  #ifdef FBGEMM_GPU_MEMCHECK
-  const auto func_name_{{ emb_weight_type }} = "{{ func_name }}_{{ emb_weight_type }}";
-  #endif
-
   #ifdef X
   #undef X
   #endif
 
-  // Define {{ emb_weight_type }} kernel invocation macro
   #define X(DeviceOnly, PackedMode, OutputRowsPerThread, InputRowsInFlight, MinNum128BRows, MaxNum128BRows) \
-    {{ func_name }}<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly, PackedMode><<< \
+    FBGEMM_LAUNCH_KERNEL( \
+      ({{ func_name }}<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly, PackedMode>), \
       nbit::div_round_up(T * nbit::div_round_up(B, num_packed_bags * OutputRowsPerThread), kWarpsPerBlock), \
      dim3(kWarpSize, kWarpsPerBlock), \
      0, \
-      at::cuda::getCurrentCUDAStream()>>>( \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, dev_weights, uint8_t, 1, 64), \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, uvm_weights, uint8_t, 1, 64), \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, weights_placements, int32_t, 1, 32), \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, weights_offsets, int64_t, 1, 32), \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, weights_tys, uint8_t, 1, 32), \
+      at::cuda::getCurrentCUDAStream(), \
+      PTA_B(dev_weights, uint8_t, 1, 64), \
+      PTA_B(uvm_weights, uint8_t, 1, 64), \
+      PTA_B(weights_placements, int32_t, 1, 32), \
+      PTA_B(weights_offsets, int64_t, 1, 32), \
+      PTA_B(weights_tys, uint8_t, 1, 32), \
      {%- if not nobag %}
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, D_offsets, int32_t, 1, 32), \
+      PTA_B(D_offsets, int32_t, 1, 32), \
      {%- else %}
      D, \
      {%- endif %}
      FixedDivisor(div_round_up(B, num_packed_bags * OutputRowsPerThread)), \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, indices, index_t, 1, 32), \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, offsets, index_t, 1, 32), \
+      PTA_B(indices, index_t, 1, 32), \
+      PTA_B(offsets, index_t, 1, 32), \
      {%- if not nobag %}
      pooling_mode, \
      {%- endif %}
      row_alignment, \
      {%- if weighted %}
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, indice_weights, float, 1, 32), \
+      PTA_B(indice_weights, float, 1, 32), \
      {%- endif %}
      {%- if emb_weight_type == "FP8" %}
      fp8_exponent_bits, \
      fp8_exponent_bias, \
      {%- endif %}
      num_packed_bags, \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, output, output_t, 2, 32), \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, lxu_cache_weights, uint8_t, 2, 64), \
-      MAKE_PTA_WITH_NAME(func_name_{{ emb_weight_type }}, lxu_cache_locations, int32_t, 1, 32) \
-    ); \
-    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
+      PTA_B(output, output_t, 2, 32), \
+      PTA_B(lxu_cache_weights, uint8_t, 2, 64), \
+      PTA_B(lxu_cache_locations, int32_t, 1, 32) \
+    );
 {%- endmacro %}
 
 {%- macro construct_and_return_output_tensor() %}
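As an aside on the template above: the launch site is wrapped in an "X macro" (#ifdef X / #undef X / #define X(...)), so the same invocation body can be expanded for many compile-time parameter combinations and then redefined for the next embedding weight type. A minimal, generic sketch of that pattern follows; launch_impl, dispatch, and the parameter names here are hypothetical and not part of FBGEMM.

#include <cstdio>

// Hypothetical kernel-launch wrapper; in the real template this would be the
// FBGEMM_LAUNCH_KERNEL call with the full template argument list.
template <int OutputRowsPerThread, bool DeviceOnly>
void launch_impl(int batch_size) {
  std::printf(
      "launch: OutputRowsPerThread=%d DeviceOnly=%d B=%d\n",
      OutputRowsPerThread,
      DeviceOnly,
      batch_size);
}

void dispatch(int batch_size, bool device_only) {
// The invocation body is written once as X(...); each use below expands it
// with a different compile-time configuration, and the macro is undefined
// afterwards so another dispatch site can redefine it.
#ifdef X
#undef X
#endif
#define X(DeviceOnly, OutputRowsPerThread) \
  launch_impl<OutputRowsPerThread, DeviceOnly>(batch_size);

  if (device_only) {
    X(true, 4)
  } else {
    X(false, 2)
  }

#undef X
}

int main() {
  dispatch(1024, true);
  dispatch(2048, false);
  return 0;
}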
