Skip to content

Commit 7a55749

Browse files
spcyppt authored and facebook-github-bot committed
Fix OSS GenAI
Summary:
- add quantize_qkv_per_head for cuda <12
- skip silu_mul_quant test as it requires H100 and more memory
- fix lint
- fix moe.parallelism import
- fix fairscale dependency
- disable layers_test as it is not a UnitTest.

Reviewed By: q10

Differential Revision: D74493137

fbshipit-source-id: 59923aa312ad8b874ff14767476a286059785a1f
1 parent 4492631 commit 7a55749

File tree

6 files changed

+35
-10
lines changed

6 files changed

+35
-10
lines changed

.github/scripts/fbgemm_gpu_test.bash

Lines changed: 4 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -79,7 +79,7 @@ __configure_fbgemm_gpu_test_cpu () {
7979
# shellcheck disable=SC2086
8080
print_exec conda env config vars set ${env_prefix} CUDA_VISIBLE_DEVICES=-1
8181

82-
ignored_tests=(
82+
export ignored_tests=(
8383
# These tests have non-CPU operators referenced in @given
8484
./uvm/copy_test.py
8585
./uvm/uvm_test.py
@@ -99,7 +99,8 @@ __configure_fbgemm_gpu_test_cuda () {
9999
# shellcheck disable=SC2086
100100
print_exec conda env config vars unset ${env_prefix} CUDA_VISIBLE_DEVICES
101101

102-
ignored_tests=(
102+
export ignored_tests=(
103+
./moe/layers_test.py # not a UnitTest
103104
)
104105
}
105106

@@ -123,7 +124,7 @@ __configure_fbgemm_gpu_test_rocm () {
123124
print_exec conda env config vars set ${env_prefix} HSA_XNACK=1
124125
fi
125126

126-
ignored_tests=(
127+
export ignored_tests=(
127128
# https://github.com/pytorch/FBGEMM/issues/1559
128129
./batched_unary_embeddings_test.py
129130
./sll/triton_sll_test.py

fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache.cu

Lines changed: 16 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -2997,5 +2997,21 @@ std::tuple<at::Tensor, at::Tensor> dequantize_fp8_cache(
29972997
throw std::runtime_error(
29982998
"CUDA version is older than 12.0"); // requires CUDA>=12
29992999
}
3000+
3001+
at::Tensor quantize_qkv_per_head(
3002+
at::Tensor xqkv_amax_row, // [B_T, HH]
3003+
at::Tensor xqkv, // [B_T, HH, D_H]
3004+
at::Tensor varseq_seqpos, // [B_T]
3005+
std::optional<at::Tensor> varseq_batch, // [B_T]
3006+
at::Tensor q_seqstarts, // [B+1]
3007+
at::Tensor cache_K, // [B][MAX_T][N_KVH][D_H]
3008+
at::Tensor cache_V, // [B][MAX_T][N_KVH][D_H]
3009+
at::Tensor XQ_O, // [B_T][N_H][D]
3010+
int64_t max_seq_length, // Length of the sequence
3011+
std::optional<at::Tensor> qparam_k,
3012+
std::optional<at::Tensor> qparam_v) {
3013+
throw std::runtime_error(
3014+
"CUDA version is older than 12.0"); // requires CUDA>=12
3015+
}
30003016
#endif
30013017
} // namespace fbgemm_gpu

fbgemm_gpu/experimental/gen_ai/test/moe/activation_test.py

Lines changed: 5 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -71,6 +71,11 @@ def ref_fn() -> torch.Tensor:
7171

7272
torch.testing.assert_allclose(y, y_ref, rtol=1.6e-2, atol=1e-3)
7373

74+
@unittest.skipIf(
75+
not torch.cuda.is_available()
76+
or torch.cuda.get_device_properties(torch.cuda.current_device()).major < 9,
77+
"Skip when H100 is not available",
78+
)
7479
@given(
7580
T=st.sampled_from([1, 128, 2048, 4096, 16384]),
7681
D=st.sampled_from([5120, 7168]),

fbgemm_gpu/experimental/gen_ai/test/moe/layers_test.py

Lines changed: 8 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -13,15 +13,9 @@
1313
import traceback
1414
from datetime import datetime
1515
from functools import partial
16-
from typing import Callable, Mapping, Tuple, Union
16+
from typing import Tuple
1717

1818
import torch
19-
from deeplearning.fbgemm.fbgemm_gpu.experimental.gen_ai.test.moe.parallelism import (
20-
get_ep_group,
21-
get_global_rank,
22-
get_routed_experts_mp_group,
23-
init_parallel,
24-
)
2519
from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import triton_quantize_fp8_row
2620
from fbgemm_gpu.experimental.gen_ai.moe.layers import (
2721
BaselineMoE,
@@ -34,6 +28,13 @@
3428
# pyre-fixme[21]: Could not find name `ProfilerActivity` in `torch.profiler`.
3529
from torch.profiler import profile, ProfilerActivity
3630

31+
from .parallelism import (
32+
get_ep_group,
33+
get_global_rank,
34+
get_routed_experts_mp_group,
35+
init_parallel,
36+
)
37+
3738
TRACE_DIR: str = "/tmp/"
3839
WARM_UP_ITERS = 15
3940
PROFILE_ITERS = 20

fbgemm_gpu/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -26,3 +26,4 @@ setuptools
2626
setuptools_git_versioning
2727
tabulate
2828
patchelf
29+
fairscale

fbgemm_gpu/requirements_genai.txt

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -28,3 +28,4 @@ setuptools
2828
setuptools_git_versioning
2929
tabulate
3030
patchelf
31+
fairscale

0 commit comments

Comments (0)