
Commit bdf7fe0

fix the conflict
2 parents ed47c7e + 759ae99 commit bdf7fe0

39 files changed, +948 −546 lines changed

csrc/gpu/all_reduce.cu

Lines changed: 2 additions & 0 deletions

@@ -72,12 +72,14 @@ void all_reduce(fptr_t _fa, paddle::Tensor& inp, paddle::Tensor& out,
           reinterpret_cast<half*>(out.data()), out.numel());
       break;
     }
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800)
     case phi::DataType::BFLOAT16: {
       fa->allreduce<nv_bfloat16>(
           stream, reinterpret_cast<nv_bfloat16*>(reg_buffer),
           reinterpret_cast<nv_bfloat16*>(out.data()), out.numel());
       break;
     }
+#endif
     default:
       throw std::runtime_error(
           "custom allreduce only supports float32, float16 and bfloat16");

csrc/gpu/all_reduce.cuh

Lines changed: 1 addition & 1 deletion

@@ -98,7 +98,7 @@ DINLINE half& assign_add(half& a, half b) {
 }
 DINLINE float& assign_add(float& a, float b) { return a += b; }

-#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800)
 DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); }
 template <>
 DINLINE nv_bfloat16 downcast_s(float val) {
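The reordered condition is functionally equivalent to the old one (an undefined __CUDA_ARCH__ evaluates as 0 inside #if); it simply puts the host-compilation case first, then the SM80 (Ampere) floor that the bfloat16 kernels require. Purely as an illustration, not part of this commit, a runtime counterpart to that compile-time threshold might look like the following hypothetical sketch, assuming an nvidia-smi recent enough to expose the compute_cap query field:

```python
# Hypothetical runtime companion to the compile-time guard above (not part of
# this commit): report whether the local GPU meets the SM80 (Ampere) floor
# that the bfloat16 paths require.
import subprocess

def local_sm_version() -> int:
    # Ask nvidia-smi for the compute capability, e.g. "8.0" -> 80.
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
        text=True,
    )
    major, minor = out.strip().splitlines()[0].split(".")
    return int(major) * 10 + int(minor)

if __name__ == "__main__":
    sm = local_sm_version()
    state = "available" if sm >= 80 else "unavailable"
    print(f"SM{sm}: bfloat16 custom all-reduce {state}")
```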

csrc/gpu/multi_head_latent_attention.cu

Lines changed: 2 additions & 0 deletions

@@ -205,6 +205,7 @@ std::vector<paddle::Tensor> MultiHeadLatentAttention(
   meta_data.batch_size = cum_offsets.dims()[0];

   switch (query.dtype()) {
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800)
     case paddle::DataType::BFLOAT16: {
       return MultiHeadLatentAttentionKernel<paddle::DataType::BFLOAT16>(
           meta_data,

@@ -253,6 +254,7 @@ std::vector<paddle::Tensor> MultiHeadLatentAttention(
           causal,
           speculate_decoder);
     }
+#endif
     case paddle::DataType::FLOAT16: {
       return MultiHeadLatentAttentionKernel<paddle::DataType::FLOAT16>(
           meta_data,

csrc/setup_cuda.py

Lines changed: 3 additions & 3 deletions

@@ -130,14 +130,11 @@ def get_gencode_flags():
     "./gpu/speculate_decoding_kernels/speculate_save_output.cc",
     "./gpu/speculate_decoding_kernels/speculate_get_output.cc",
     "./gpu/save_output_dygraph.cu",
-    "./gpu/cpp_extensions.cu",
     "./gpu/all_reduce.cu",
     "./gpu/quantization/per_token_group_quant.cu",
     "./gpu/quantization/per_tensor_quant_fp8.cu",
 ]
 sources += find_end_files("./gpu/speculate_decoding_kernels", ".cu")
-sources += find_end_files("./gpu/moe/fused_moe/cutlass_kernels/moe_gemm/", ".cu")
-sources += find_end_files("./gpu/moe/fused_moe/", ".cu")

 nvcc_compile_args = gencode_flags
 update_git_submodule()

@@ -174,6 +171,9 @@ def get_gencode_flags():

 sources += find_end_files("./gpu/append_attn", ".cu")
 sources += find_end_files("./gpu/append_attn/template_instantiation", ".cu")
+sources += find_end_files("./gpu/moe/fused_moe/cutlass_kernels/moe_gemm/", ".cu")
+sources += find_end_files("./gpu/moe/fused_moe/", ".cu")
+sources += "./gpu/cpp_extensions.cu",


 fp8_auto_gen_directory = "gpu/cutlass_kernels/fp8_gemm_fused/autogen"
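The last added line above, `sources += "./gpu/cpp_extensions.cu",`, is easy to misread: the trailing comma turns the right-hand side into a one-element tuple, and `list += iterable` extends in place, so the statement behaves like an append. A minimal check of that Python semantics:

```python
# Minimal demonstration of the trailing-comma form used above.
sources = ["a.cu"]
sources += "./gpu/cpp_extensions.cu",   # same as sources += ("./gpu/cpp_extensions.cu",)
print(sources)                          # ['a.cu', './gpu/cpp_extensions.cu']
```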

csrc/tools/build_wheel.sh

Lines changed: 13 additions & 2 deletions

@@ -61,7 +61,7 @@ function generate_sm_version(){
         sm_versions=($SM_VERSION )
     elif [ "$ARCHITECTURE" = "all" ]; then
         if awk -v version="$cuda_version" 'BEGIN { exit !(version >= 12.0) }'; then
-            sm_versions=(70 75 80 80 86 89 90 )
+            sm_versions=(70 75 80 86 89 90 )
         else
             sm_versions=(70 75 80 86 89 )
         fi

@@ -93,6 +93,7 @@ function create_directories(){

 import os
 from datetime import datetime
+import paddle

 from setuptools import find_packages, setup

@@ -109,14 +110,24 @@ def read(file: str):
         content = f.read().strip()
     return content

+def get_sm_version():
+    prop = paddle.device.cuda.get_device_properties()
+    cc = prop.major * 10 + prop.minor
+    return cc

 def read_version():
     """
     read version and return content
     """
     __version__ = "3.0.0b4.post"
+
     formatted_date = datetime.now().date().strftime("%Y%m%d")
-    __version__ = __version__.replace(".post", ".post{}".format(formatted_date))
+    cuda_version = float(paddle.version.cuda())
+    sm_version = get_sm_version()
+    paddle_commit = paddle.__git_commit__[:7]
+    build_tag = "{}+cuda{}sm{}paddle{}".format(formatted_date, cuda_version, sm_version, paddle_commit)
+
+    __version__ = __version__.replace(".post", ".post{}".format(build_tag))

     return __version__
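For context, a self-contained sketch of the tag format the new read_version() produces. The hard-coded values below are placeholders for what paddle.version.cuda(), get_sm_version(), and paddle.__git_commit__ would report on a real build machine:

```python
# Illustration only: reproduce the version tag assembled in read_version(),
# with placeholder values standing in for what paddle would report.
from datetime import datetime

base_version = "3.0.0b4.post"
formatted_date = datetime.now().date().strftime("%Y%m%d")
cuda_version = 12.3          # e.g. float(paddle.version.cuda())
sm_version = 90              # e.g. prop.major * 10 + prop.minor on an SM90 GPU
paddle_commit = "0123abc"    # e.g. paddle.__git_commit__[:7]

build_tag = "{}+cuda{}sm{}paddle{}".format(formatted_date, cuda_version, sm_version, paddle_commit)
print(base_version.replace(".post", ".post{}".format(build_tag)))
# -> 3.0.0b4.post20250101+cuda12.3sm90paddle0123abc  (date varies)
```

Embedding the CUDA version, SM version, and Paddle commit in the version's local segment makes wheels built against different toolchains distinguishable from the version string alone.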

llm/config/llama/grpo_argument.yaml

Lines changed: 2 additions & 2 deletions

@@ -108,8 +108,8 @@ recompute_granularity: "full" # Granularity of recompute
 bf16: true # Whether to use mixed precision with bfloat16
 fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
 amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level=’O2’
-amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
-amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
+amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
+amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
 offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
 release_grads: true # Whether to release gradients
 offload_optim: false # Whether to offload optimizer to pinned memory
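The amp black/white lists are now written as YAML flow sequences rather than space-separated strings, so they load directly as lists of operator names. A quick illustration with PyYAML (an assumption; the diff does not show which YAML loader the training scripts use):

```python
# Compare how a standard YAML loader reads the old and new list formats.
import yaml

old = yaml.safe_load('amp_custom_black_list: "reduce_sum softmax_with_cross_entropy"')
new = yaml.safe_load('amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy"]')

print(type(old["amp_custom_black_list"]))  # <class 'str'>  -> one space-separated string
print(type(new["amp_custom_black_list"]))  # <class 'list'> -> ['reduce_sum', 'softmax_with_cross_entropy']
```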

llm/config/qwen/grpo_32b_argument.yaml

Lines changed: 2 additions & 2 deletions

@@ -109,8 +109,8 @@ recompute_granularity: "full" # Granularity of recompute
 bf16: true # Whether to use mixed precision with bfloat16
 fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
 amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level=’O2’
-amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
-amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
+amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
+amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
 offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
 release_grads: true # Whether to release gradients
 offload_optim: true # Whether to offload optimizer to pinned memory

llm/config/qwen/grpo_argument.yaml

Lines changed: 2 additions & 2 deletions

@@ -109,8 +109,8 @@ recompute_granularity: "full" # Granularity of recompute
 bf16: true # Whether to use mixed precision with bfloat16
 fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
 amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level=’O2’
-amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
-amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
+amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
+amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
 offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
 release_grads: true # Whether to release gradients
 offload_optim: false # Whether to offload optimizer to pinned memory

llm/config/qwen/reinforce_plus_plus_argument.yaml

Lines changed: 2 additions & 2 deletions

@@ -109,8 +109,8 @@ recompute_granularity: "full" # Granularity of recompute
 bf16: true # Whether to use mixed precision with bfloat16
 fp16_opt_level: "O2" # Optimization level for fp16 and bf16 training
 amp_master_grad: false # Whether to use float32 weight gradients for master weights in amp opt level=’O2’
-amp_custom_black_list: "reduce_sum softmax_with_cross_entropy c_softmax_with_cross_entropy elementwise_div sin cos" # Custom black list for amp
-amp_custom_white_list: "lookup_table lookup_table_v2 flash_attn matmul matmul_v2 fused_gemm_epilogue" # Custom white list for amp
+amp_custom_black_list: ["reduce_sum", "softmax_with_cross_entropy", "c_softmax_with_cross_entropy", "elementwise_div", "sin", "cos"] # Custom black list for amp
+amp_custom_white_list: ["lookup_table", "lookup_table_v2", "flash_attn", "matmul", "matmul_v2", "fused_gemm_epilogue"] # Custom white list for amp
 offload_level: "freeze_model" # Level of model offloading to pinned memory, supported values: freeze_model, train_model, optimizer
 release_grads: true # Whether to release gradients
 offload_optim: false # Whether to offload optimizer to pinned memory
