
Commit 01a187d

Alexander Balaban authored and facebook-github-bot committed
TBE CPU nobag dispatch and forward pass kernel impl (#4302)
Summary:
Pull Request resolved: #4302
X-link: facebookresearch/FBGEMM#1378

This diff introduces a simple forward-pass kernel for CPU to cover TBE with pooling mode NONE.

Differential Revision: D75464152
1 parent b9c9c89 commit 01a187d
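
For orientation, the sketch below shows how the newly registered op can be reached directly through the PyTorch dispatcher, mirroring the schema registered in embedding_forward_split_nobag_cpu.cpp and the typed lookup done by the PT2 CPU wrapper later in this diff. The free function call_tbe_nobag_forward_cpu and the inline argument comments are illustrative only, not part of the commit:

#include <ATen/core/dispatch/Dispatcher.h>
#include <torch/script.h>

using Tensor = at::Tensor;

// Illustrative caller (not part of the commit). The argument layout follows
// the new kernel: T tables flattened into `weights`, per-table start offsets
// in `weights_offsets`, and `offsets` holding T * B + 1 boundaries that
// segment `indices` per (table, sample).
Tensor call_tbe_nobag_forward_cpu(
    const Tensor& weights,          // flattened embedding tables (CPU)
    const Tensor& weights_offsets,  // [T], int64
    int64_t D,                      // embedding dimension, shared by all tables
    const Tensor& hash_size_cumsum, // [T + 1], int64 cumulative row counts
    const Tensor& indices,          // [N] row ids
    const Tensor& offsets,          // [T * B + 1] segment boundaries
    int64_t output_dtype) {         // SparseType value: FP32, FP16, or BF16
  static auto op =
      c10::Dispatcher::singleton()
          .findSchemaOrThrow(
              "fbgemm::split_embedding_nobag_codegen_forward_cpu", "")
          .typed<Tensor(
              const Tensor&,
              const Tensor&,
              c10::SymInt,
              const Tensor&,
              const Tensor&,
              const Tensor&,
              int64_t)>();
  // Pooling mode NONE: the result is [N, D]; row i is a dtype-cast copy of
  // the embedding row selected by indices[i], with no reduction over the bag.
  return op.call(
      weights, weights_offsets, D, hash_size_cumsum, indices, offsets, output_dtype);
}

The typed<> signature above mirrors the nobag branch of the PT2 wrapper template in this diff; the registered schema itself declares D as a plain int.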

File tree

4 files changed (+296, -7 lines)

fbgemm_gpu/cmake/tbe_sources.py

Lines changed: 1 addition & 0 deletions
@@ -319,6 +319,7 @@
 static_cpu_files_common = [
     "codegen/utils/embedding_bounds_check_host_cpu.cpp",
     "codegen/training/forward/embedding_forward_split_cpu.cpp",
+    "codegen/training/forward/embedding_forward_split_nobag_cpu.cpp",
     "codegen/training/pt2/pt2_autograd_utils.cpp",
 ]

fbgemm_gpu/codegen/training/forward/embedding_forward_split_nobag_cpu.cpp

Lines changed: 198 additions & 0 deletions
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/Parallel.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <torch/script.h>
+
+#include "fbgemm_gpu/embedding_common.h"
+#include "fbgemm_gpu/utils/cpu_utils.h"
+#include "fbgemm_gpu/utils/dispatch_macros.h"
+#include "fbgemm_gpu/utils/ops_utils.h"
+
+#if FBGEMM_GPU_MEMCHECK
+#define FBGEMM_MEM_CHECK_ONLY
+#else
+#define FBGEMM_MEM_CHECK_ONLY maybe_unused
+#endif
+
+using Tensor = at::Tensor;
+using namespace fbgemm_gpu;
+
+template <
+    typename weights_t,
+    typename index_t,
+    typename offset_t,
+    typename output_t>
+void split_embedding_nobag_codegen_forward_cpu_kernel(
+    const Tensor& weights,
+    const Tensor& weights_offsets,
+    int64_t D,
+    const Tensor& hash_size_cumsum,
+    const Tensor& indices,
+    const Tensor& offsets,
+    const Tensor& output) {
+  TORCH_CHECK(weights.is_contiguous());
+  Tensor indices_contig = indices.contiguous();
+  Tensor offsets_contig = offsets.contiguous();
+
+  const auto weights_offsets_data = weights_offsets.accessor<int64_t, 1>();
+  const auto hash_size_cumsum_data = hash_size_cumsum.accessor<int64_t, 1>();
+  const auto indices_data = indices.data_ptr<index_t>();
+  const auto offsets_data = offsets.data_ptr<offset_t>();
+  const auto weights_data = weights.data_ptr<weights_t>();
+  auto output_data = output.data_ptr<output_t>();
+
+  int64_t T = weights_offsets.size(0);
+  int64_t B = (offsets.size(0) - 1) / T;
+  TORCH_CHECK_GE(B, 0);
+
+  at::parallel_for(0, T, 0, [&](int64_t t_begin, int64_t t_end) {
+    for (const auto t : c10::irange(t_begin, t_end)) {
+      int64_t hash_size = 0;
+      int64_t t_temp = static_cast<int64_t>(t) + 1;
+      do {
+        hash_size = hash_size_cumsum_data[t_temp] - hash_size_cumsum_data[t];
+        ++t_temp;
+      } while (hash_size == 0);
+
+      const auto table_begin = weights_offsets_data[t];
+
+      bool success = true;
+      at::parallel_for(0, B, 0, [&](int64_t b_begin, int64_t b_end) {
+        for (const auto b : c10::irange(b_begin, b_end)) {
+          const auto indices_start = offsets_data[t * B + b];
+          const auto indices_end = offsets_data[t * B + b + 1];
+          for (auto i = indices_start; i < indices_end; ++i) {
+            const auto idx = indices_data[i];
+            if (idx < 0 || idx >= hash_size) {
+              success = false;
+              continue;
+            }
+            const auto embedding_offset = table_begin + idx * D;
+            for (const auto d : c10::irange(D)) {
+              output_data[i * D + d] =
+                  static_cast<output_t>(weights_data[embedding_offset + d]);
+            }
+          }
+        }
+      });
+
+      if (!success) {
+        fbgemm_gpu::report_embedding_error(
+            static_cast<int>(t),
+            static_cast<int>(B),
+            0,
+            static_cast<int>(B),
+            offsets_data,
+            indices_data,
+            hash_size);
+      }
+    }
+  });
+}
+
+Tensor split_embedding_nobag_codegen_forward_cpu(
+    const Tensor& weights,
+    const Tensor& weights_offsets,
+    int64_t D,
+    const Tensor& hash_size_cumsum,
+    const Tensor& indices,
+    const Tensor& offsets,
+    int64_t output_dtype) {
+  int64_t num_indices = indices.size(0);
+  auto options = weights.options();
+  if (output_dtype == static_cast<int64_t>(SparseType::FP32)) {
+    options = weights.options().dtype(at::kFloat);
+  } else if (output_dtype == static_cast<int64_t>(SparseType::FP16)) {
+    options = weights.options().dtype(at::kHalf);
+  } else if (output_dtype == static_cast<int64_t>(SparseType::BF16)) {
+    options = weights.options().dtype(at::kBFloat16);
+  }
+  Tensor output = at::empty({num_indices, D}, options);
+
+  // Dispatch based on indices, offsets, and output types
+  FBGEMM_DISPATCH_FLOAT_AND_HALF(
+      output.scalar_type(), "split_embedding_nobag_cpu_forward_1", [&]() {
+        using output_t = scalar_t;
+
+        FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(
+            weights.scalar_type(), "split_embedding_nobag_cpu_forward_2", [&] {
+              using weights_t = scalar_t;
+
+              AT_DISPATCH_INDEX_TYPES(
+                  offsets.scalar_type(),
+                  "split_embedding_nobag_cpu_forward_3",
+                  [&] {
+                    using offset_t = index_t;
+
+                    AT_DISPATCH_INDEX_TYPES(
+                        indices.scalar_type(),
+                        "split_embedding_nobag_cpu_forward_4",
+                        [&] {
+                          split_embedding_nobag_codegen_forward_cpu_kernel<
+                              weights_t,
+                              index_t,
+                              offset_t,
+                              output_t>(
+                              weights,
+                              weights_offsets,
+                              D,
+                              hash_size_cumsum,
+                              indices,
+                              offsets,
+                              output);
+                        });
+                  });
+            });
+      });
+
+  return output;
+}
+
+Tensor split_embedding_nobag_codegen_forward_cpu_meta(
+    const Tensor& weights,
+    const Tensor& /* weights_offsets */,
+    int64_t D,
+    const Tensor& /* hash_size_cumsum */,
+    const Tensor& indices,
+    const Tensor& /* offsets */,
+    int64_t output_dtype) {
+  c10::SymInt num_indices = indices.sym_size(0);
+  auto dtype = weights.options();
+  if (output_dtype == static_cast<int64_t>(SparseType::FP32)) {
+    dtype = weights.options().dtype(at::kFloat);
+  } else if (output_dtype == static_cast<int64_t>(SparseType::FP16)) {
+    dtype = weights.options().dtype(at::kHalf);
+  } else if (output_dtype == static_cast<int64_t>(SparseType::BF16)) {
+    dtype = weights.options().dtype(at::kBFloat16);
+  }
+  return at::empty_symint({num_indices, D}, dtype);
+}
+
+namespace {
+
+TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
+  m.def(
+      "split_embedding_nobag_codegen_forward_cpu(Tensor weights, "
+      " Tensor weights_offsets, "
+      " int D, "
+      " Tensor hash_size_cumsum, "
+      " Tensor indices, "
+      " Tensor offsets, "
+      " int output_dtype) -> Tensor");
+
+  DISPATCH_TO_CPU(
+      "split_embedding_nobag_codegen_forward_cpu",
+      split_embedding_nobag_codegen_forward_cpu);
+
+  DISPATCH_TO_META(
+      "split_embedding_nobag_codegen_forward_cpu",
+      split_embedding_nobag_codegen_forward_cpu_meta);
+}
+} // namespace
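
As a sanity check on the semantics of the kernel above: with a single table, weights_offsets[0] is 0 and the nobag forward degenerates to a dtype-cast row gather. A minimal reference, assuming in-range indices and a weights tensor already viewed as [num_rows, D] (reference_single_table is a hypothetical helper, not part of the commit):

#include <ATen/ATen.h>

// For T = 1 the loop above copies weight row indices[i] into output row i and
// casts it to the requested output dtype, i.e. a plain row gather.
at::Tensor reference_single_table(
    const at::Tensor& weights_2d,   // [num_rows, D] single table, 2-D view
    const at::Tensor& indices,      // [N] int32/int64 row ids, all in range
    at::ScalarType output_dtype) {  // e.g. at::kFloat
  return weights_2d.index_select(/*dim=*/0, indices).to(output_dtype);
}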

fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_cpu_wrapper_template.cpp

Lines changed: 51 additions & 5 deletions
@@ -100,26 +100,35 @@ Tensor split_embedding_codegen_grad_indice_weights{{ vdesc }}_pt2_cpu_wrapper(
       feature_requires_grad);
 }
 {%- endif %}
+
 {%- for weighted in [True, False] %}
 {%- set wdesc = "weighted" if weighted else "unweighted" %}
+{%- for nobag in ([False] if (weighted or vbe) else [True, False]) %}
+{%- set ndesc = "_nobag" if nobag else "" %}

 {% if is_forward %}
 {#-/* PT2 wrapper function for forward CPU */#}
-Tensor split_embedding_codegen_forward_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
+Tensor split_embedding{{ ndesc }}_codegen_forward_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
     const Tensor& host_weights,
     const Tensor& /*dev_weights*/,
     const Tensor& /*uvm_weights*/,
     const Tensor& /*lxu_cache_weights*/,
     const Tensor& /*weights_placements*/,
     const Tensor& weights_offsets,
+    {%- if nobag %}
+    const c10::SymInt D,
+    {%- else %}
     const Tensor& D_offsets,
     const c10::SymInt total_D,
     const c10::SymInt /*max_D*/,
+    {%- endif %}
     const Tensor& hash_size_cumsum,
     const Tensor& indices,
     const Tensor& offsets,
+    {%- if not nobag %}
     const int64_t pooling_mode,
     const Tensor& indice_weights,
+    {%- endif %}
     const Tensor& /*lxu_cache_locations*/,
     const Tensor& /*uvm_cache_stats*/,
     {%- if vbe %}
@@ -142,11 +151,34 @@ Tensor split_embedding_codegen_forward_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
       offsets_ = reshape_vbe_offsets<index_t>(offsets, vbe_B_offsets_rank_per_feature, max_B_int, D_offsets.numel() - 1);
   });
   {%- endif %}
+  {%- set op = "split_embedding{}_codegen_forward_cpu".format(
+          ndesc
+      )
+  %}
   static auto op =
       torch::Dispatcher::singleton()
-          .findSchemaOrThrow("fbgemm::split_embedding_codegen_forward_cpu", "")
+          .findSchemaOrThrow("fbgemm::{{ op }}", "")
          .typed<Tensor(
-              Tensor, Tensor, Tensor, c10::SymInt, Tensor, Tensor, Tensor, int64_t, Tensor, int64_t
+              {%- if nobag %}
+              const Tensor&, /*weights*/
+              const Tensor&, /*weights_offsets*/
+              c10::SymInt, /*D*/
+              const Tensor&, /*hash_size_cumsum*/
+              const Tensor&, /*indices*/
+              const Tensor&, /*offsets*/
+              int64_t /*output_dtype*/
+              {%- else %}
+              Tensor, /*weights*/
+              Tensor, /*weights_offsets*/
+              Tensor, /*D_offsets*/
+              c10::SymInt, /*total_D*/
+              Tensor, /*hash_size_cumsum*/
+              Tensor, /*indices*/
+              Tensor, /*offsets*/
+              int64_t, /*pooling_mode*/
+              Tensor, /*indice_weights*/
+              int64_t /*output_dtype*/
+              {%- endif %}
          )>();
   {%- if vbe %}
   // TODO: remove this after vbe is implemented for CPU kernel
@@ -189,18 +221,25 @@ Tensor split_embedding_codegen_forward_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
   return op.call(
       host_weights,
       weights_offsets,
+      {%- if nobag %}
+      D,
+      {%- else %}
       D_offsets,
       total_D,
+      {%- endif %}
       hash_size_cumsum,
       indices,
       offsets,
+      {%- if not nobag %}
       pooling_mode,
       indice_weights,
+      {%- endif %}
       output_dtype);
   {%- endif %}
 }
 {% else %}
 {#-/* PT2 wrapper function for backward CPU */#}
+{%- if not nobag %}
 Tensor split_embedding_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
     const Tensor& grad_output,
     const Tensor& host_weights,
@@ -296,18 +335,22 @@ Tensor split_embedding_backward_codegen_{{ optimizer }}_{{ wdesc }}{{ vdesc }}_p
   );
   return Tensor();
 }
+{% endif %} {#-/*if not nobag*/#}
 {% endif %}
+{%- endfor %} {#-/*for nobag*/#}
 {%- endfor %} {#-/*for weighted*/#}


 namespace {
 TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   {%- for weighted in [True, False] %}
   {%- set wdesc = "weighted" if weighted else "unweighted" %}
+  {%- for nobag in ([False] if (weighted or vbe) else [True, False]) %}
+  {%- set ndesc = "_nobag" if nobag else "" %}

   {%- if is_forward %}
-  {%- set embedding_codegen_forward_op = "split_embedding_codegen_forward_{}{}_pt2".format(
-      wdesc, vdesc
+  {%- set embedding_codegen_forward_op = "split_embedding{}_codegen_forward_{}{}_pt2".format(
+      ndesc, wdesc, vdesc
   )
   %}

@@ -360,6 +403,7 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   DISPATCH_TO_CPU("{{ embedding_codegen_forward_op }}_wrapper", {{ embedding_codegen_forward_op }}_cpu_wrapper);

   {%- else %} {#-/* backward */#}
+  {%- if not nobag %}
   {%- set embedding_codegen_backward_op = "split_embedding_backward_codegen_{}_{}{}_pt2".format(
       optimizer, wdesc, vdesc
   )
@@ -410,7 +454,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   {%- endif %}
   ") -> Tensor");
   DISPATCH_TO_CPU("{{ embedding_codegen_backward_op }}_wrapper", {{ embedding_codegen_backward_op }}_cpu_wrapper);
+  {%- endif %} {#-/*if not nobag*/#}
   {%- endif %} {#-/*if is_forward*/#}
+  {%- endfor %} {#-/*for nobag*/#}
   {%- endfor %} {#-/*for weighted*/#}

   {%- if is_forward %}
