
Commit bdf7fe0

fix the conflict
2 parents ed47c7e + 759ae99 commit bdf7fe0

39 files changed, +948 −546 lines changed

csrc/gpu/all_reduce.cu

Lines changed: 2 additions & 0 deletions

@@ -72,12 +72,14 @@ void all_reduce(fptr_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
           reinterpret_cast<half*>(out.data()), out.numel());
       break;
     }
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800)
     case phi::DataType::BFLOAT16: {
       fa->allreduce<nv_bfloat16>(
           stream, reinterpret_cast<nv_bfloat16*>(reg_buffer),
           reinterpret_cast<nv_bfloat16*>(out.data()), out.numel());
       break;
     }
+#endif
     default:
       throw std::runtime_error(
           "custom allreduce only supports float32, float16 and bfloat16");

csrc/gpu/all_reduce.cuh

Lines changed: 1 addition & 1 deletion

@@ -98,7 +98,7 @@ DINLINE half& assign_add(half& a, half b) {
 }
 DINLINE float& assign_add(float& a, float b) { return a += b; }

-#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800)
 DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); }
 template <>
 DINLINE nv_bfloat16 downcast_s(float val) {
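The reordered condition is functionally equivalent to the old one (an undefined __CUDA_ARCH__ evaluates as 0 inside #if); it simply puts the host-compilation case first, then the SM80 (Ampere) floor that the bfloat16 kernels require. Purely as an illustration, not part of this commit, a runtime counterpart to that compile-time threshold might look like the following hypothetical sketch, assuming an nvidia-smi recent enough to expose the compute_cap query field:

```python
# Hypothetical runtime companion to the compile-time guard above (not part of
# this commit): report whether the local GPU meets the SM80 (Ampere) floor
# that the bfloat16 paths require.
import subprocess

def local_sm_version() -> int:
    # Ask nvidia-smi for the compute capability, e.g. "8.0" -> 80.
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
        text=True,
    )
    major, minor = out.strip().splitlines()[0].split(".")
    return int(major) * 10 + int(minor)

if __name__ == "__main__":
    sm = local_sm_version()
    state = "available" if sm >= 80 else "unavailable"
    print(f"SM{sm}: bfloat16 custom all-reduce {state}")
```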

csrc/gpu/multi_head_latent_attention.cu

Lines changed: 2 additions & 0 deletions

@@ -205,6 +205,7 @@ std::vector<paddle::Tensor> MultiHeadLatentAttention(
   meta_data.batch_size = cum_offsets.dims()[0];

   switch (query.dtype()) {
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800)
     case paddle::DataType::BFLOAT16: {
       return MultiHeadLatentAttentionKernel<paddle::DataType::BFLOAT16>(
           meta_data,

@@ -253,6 +254,7 @@ std::vector<paddle::Tensor> MultiHeadLatentAttention(
           causal,
           speculate_decoder);
     }
+#endif
     case paddle::DataType::FLOAT16: {
       return MultiHeadLatentAttentionKernel<paddle::DataType::FLOAT16>(
           meta_data,

csrc/setup_cuda.py

Lines changed: 3 additions & 3 deletions

@@ -130,14 +130,11 @@ def get_gencode_flags():
     "./gpu/speculate_decoding_kernels/speculate_save_output.cc",
     "./gpu/speculate_decoding_kernels/speculate_get_output.cc",
     "./gpu/save_output_dygraph.cu",
-    "./gpu/cpp_extensions.cu",
     "./gpu/all_reduce.cu",
     "./gpu/quantization/per_token_group_quant.cu",
     "./gpu/quantization/per_tensor_quant_fp8.cu",
 ]
 sources += find_end_files("./gpu/speculate_decoding_kernels", ".cu")
-sources += find_end_files("./gpu/moe/fused_moe/cutlass_kernels/moe_gemm/", ".cu")
-sources += find_end_files("./gpu/moe/fused_moe/", ".cu")

 nvcc_compile_args = gencode_flags
 update_git_submodule()

@@ -174,6 +171,9 @@ def get_gencode_flags():

 sources += find_end_files("./gpu/append_attn", ".cu")
 sources += find_end_files("./gpu/append_attn/template_instantiation", ".cu")
+sources += find_end_files("./gpu/moe/fused_moe/cutlass_kernels/moe_gemm/", ".cu")
+sources += find_end_files("./gpu/moe/fused_moe/", ".cu")
+sources += "./gpu/cpp_extensions.cu",


 fp8_auto_gen_directory = "gpu/cutlass_kernels/fp8_gemm_fused/autogen"
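The last added line above, `sources += "./gpu/cpp_extensions.cu",`, is easy to misread: the trailing comma turns the right-hand side into a one-element tuple, and `list += iterable` extends in place, so the statement behaves like an append. A minimal check of that Python semantics:

```python
# Minimal demonstration of the trailing-comma form used above.
sources = ["a.cu"]
sources += "./gpu/cpp_extensions.cu",   # same as sources += ("./gpu/cpp_extensions.cu",)
print(sources)                          # ['a.cu', './gpu/cpp_extensions.cu']
```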

csrc/tools/build_wheel.sh

Lines changed: 13 additions & 2 deletions

@@ -61,7 +61,7 @@ function generate_sm_version(){
         sm_versions=($SM_VERSION )
     elif [ "$ARCHITECTURE" = "all" ]; then
         if awk -v version="$cuda_version" 'BEGIN { exit !(version >= 12.0) }'; then
-            sm_versions=(70 75 80 80 86 89 90 )
+            sm_versions=(70 75 80 86 89 90 )
         else
             sm_versions=(70 75 80 86 89 )
         fi

@@ -93,6 +93,7 @@ function create_directories(){

 import os
 from datetime import datetime
+import paddle

 from setuptools import find_packages, setup

@@ -109,14 +110,24 @@ def read(file: str):
         content = f.read().strip()
     return content

+def get_sm_version():
+    prop = paddle.device.cuda.get_device_properties()
+    cc = prop.major * 10 + prop.minor
+    return cc

 def read_version():
     """
     read version and return content
     """
     __version__ = "3.0.0b4.post"
+
     formatted_date = datetime.now().date().strftime("%Y%m%d")
-    __version__ = __version__.replace(".post", ".post{}".format(formatted_date))
+    cuda_version = float(paddle.version.cuda())
+    sm_version = get_sm_version()
+    paddle_commit = paddle.__git_commit__[:7]
+    build_tag = "{}+cuda{}sm{}paddle{}".format(formatted_date, cuda_version, sm_version, paddle_commit)
+
+    __version__ = __version__.replace(".post", ".post{}".format(build_tag))

     return __version__
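For context, a self-contained sketch of the tag format the new read_version() produces. The hard-coded values below are placeholders for what paddle.version.cuda(), get_sm_version(), and paddle.__git_commit__ would report on a real build machine:

```python
# Illustration only: reproduce the version tag assembled in read_version(),
# with placeholder values standing in for what paddle would report.
from datetime import datetime

base_version = "3.0.0b4.post"
formatted_date = datetime.now().date().strftime("%Y%m%d")
cuda_version = 12.3          # e.g. float(paddle.version.cuda())
sm_version = 90              # e.g. prop.major * 10 + prop.minor on an SM90 GPU
paddle_commit = "0123abc"    # e.g. paddle.__git_commit__[:7]

build_tag = "{}+cuda{}sm{}paddle{}".format(formatted_date, cuda_version, sm_version, paddle_commit)
print(base_version.replace(".post", ".post{}".format(build_tag)))
# -> 3.0.0b4.post20250101+cuda12.3sm90paddle0123abc  (date varies)
```

Embedding the CUDA version, SM version, and Paddle commit in the version's local segment makes wheels built against different toolchains distinguishable from the version string alone.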

llm/config/llama/grpo_argument.yaml

Lines changed: 2 additions & 2 deletions

@@ -108,8 +108,8 @@ recompute_granularity: "full" # Granularity of recompute
 bf16: true # Whether to use mixed precision with bfloat16
 fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
 amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level=’O2’
-amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
-amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
+amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
+amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
 offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
 release_grads: true # Whether to release gradients
 offload_optim: false # Whether to offload optimizer to pinned memory
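The amp black/white lists are now written as YAML flow sequences rather than space-separated strings, so they load directly as lists of operator names. A quick illustration with PyYAML (an assumption; the diff does not show which YAML loader the training scripts use):

```python
# Compare how a standard YAML loader reads the old and new list formats.
import yaml

old = yaml.safe_load('amp_custom_black_list: "reduce_sum softmax_with_cross_entropy"')
new = yaml.safe_load('amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy"]')

print(type(old["amp_custom_black_list"]))  # <class 'str'>  -> one space-separated string
print(type(new["amp_custom_black_list"]))  # <class 'list'> -> ['reduce_sum', 'softmax_with_cross_entropy']
```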

llm/config/qwen/grpo_32b_argument.yaml

Lines changed: 2 additions & 2 deletions

@@ -109,8 +109,8 @@ recompute_granularity: "full" # Granularity of recompute
 bf16: true # Whether to use mixed precision with bfloat16
 fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
 amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level=’O2’
-amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
-amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
+amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
+amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
 offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
 release_grads: true # Whether to release gradients
 offload_optim: true # Whether to offload optimizer to pinned memory

llm/config/qwen/grpo_argument.yaml

Lines changed: 2 additions & 2 deletions

@@ -109,8 +109,8 @@ recompute_granularity: "full" # Granularity of recompute
 bf16: true # Whether to use mixed precision with bfloat16
 fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
 amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level=’O2’
-amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
-amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
+amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
+amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
 offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
 release_grads: true # Whether to release gradients
 offload_optim: false # Whether to offload optimizer to pinned memory

llm/config/qwen/reinforce_plus_plus_argument.yaml

Lines changed: 2 additions & 2 deletions

@@ -109,8 +109,8 @@ recompute_granularity: "full" # Granularity of recompute
 bf16: true # Whether to use mixed precision with bfloat16
 fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
 amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level=’O2’
-amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
-amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
+amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
+amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
 offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
 release_grads: true # Whether to release gradients
 offload_optim: false # Whether to offload optimizer to pinned memory
