Commit 042179d

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into develop

2 parents: c8818b5 + 1d74d62

163 files changed: +8677, -819 lines

(This is a large commit; some content is hidden by default, and only part of the diff is reproduced below.)


.github/PULL_REQUEST_TEMPLATE.md
Lines changed: 14 additions & 0 deletions

```diff
@@ -1,4 +1,18 @@
 <!-- Demo: https://github.com/PaddlePaddle/PaddleNLP/pull/26 -->
+#### Before submitting
+
+- [ ] Lint code. If there are lint issues, please format the code first.
+
+```shell
+# Install and register `pre-commit` in the project folder
+pip install pre-commit && pre-commit install
+
+# Process previous code files separately
+pre-commit run --file XXXX.py
+```
+
+- [ ] Add test cases into `tests` folder. If there are codecov issues, please add test cases first.
+
 ### PR types
 <!-- One of [ New features | Bug fixes | Function optimization | Performance optimization | Breaking changes | Others ] -->
```
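A usage note on the lint checklist above: besides running hooks on individual files, pre-commit can run every configured hook across the whole tree. A minimal sketch using only standard pre-commit CLI flags:

```shell
# Run all configured hooks against every file tracked by git
pre-commit run --all-files

# Or re-run hooks only on the files touched by the most recent commit
pre-commit run --files $(git diff --name-only HEAD~1)
```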

README.md
Lines changed: 2 additions & 1 deletion

````diff
@@ -204,7 +204,8 @@ mkdir -p llm/data && cd llm/data
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.bin
 wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k.idx
 cd .. # change folder to PaddleNLP/llm
-python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/llama/pretrain_argument.json
+# To use use_fused_rms_norm=true, first go to slm/model_zoo/gpt-3/external_ops and install fused_ln
+python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/llama/pretrain_argument.json --use_fused_rms_norm false
 ```

 ### LLM SFT Fine-Tuning
````
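Once fused_ln is installed, the fused RMSNorm path can be switched back on. Only the --use_fused_rms_norm flag below comes from the diff above; the install step is an assumption (a conventional setup.py build in that directory), so check slm/model_zoo/gpt-3/external_ops for its actual build script:

```shell
# Assumed install step; verify the actual build script shipped in external_ops
cd slm/model_zoo/gpt-3/external_ops && python setup.py install && cd -

# With fused_ln available, re-enable the fused kernel
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./config/llama/pretrain_argument.json --use_fused_rms_norm true
```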

csrc/gpu/append_attention.cu
Lines changed: 2 additions & 45 deletions

```diff
@@ -59,10 +59,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float out_linear_in_scale,
-    const int encoder_block_shape_q,
-    const int decoder_block_shape_q,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool speculate_decoder) {
@@ -76,7 +72,8 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
   int max_enc_len_this_time_data = max_enc_len_this_time.data<int>()[0];
   int max_dec_len_this_time_data = max_dec_len_this_time.data<int>()[0];
   int max_len_kv_data = max_len_kv.data<int>()[0];
-
+  const int encoder_block_shape_q = get_encoder_block_shape_q();
+  const int decoder_block_shape_q = get_decoder_block_shape_q();
   auto main_stream = qkv.stream();
   static cudaEvent_t main_event;
   static cudaEvent_t decoder_event;
@@ -209,8 +206,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         false,
@@ -248,8 +243,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         false,
@@ -292,8 +285,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         false,
@@ -440,8 +431,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         !speculate_decoder,
@@ -479,8 +468,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         !speculate_decoder,
@@ -524,8 +511,6 @@ std::vector<paddle::Tensor> AppendAttentionKernel(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         !speculate_decoder,
@@ -583,10 +568,6 @@ std::vector<paddle::Tensor> AppendAttention(
     const float quant_max_bound,
     const float quant_min_bound,
     const float out_linear_in_scale,
-    const int encoder_block_shape_q,
-    const int decoder_block_shape_q,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool speculate_decoder) {
@@ -648,10 +629,6 @@ std::vector<paddle::Tensor> AppendAttention(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        encoder_block_shape_q,
-        decoder_block_shape_q,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         speculate_decoder);
@@ -698,10 +675,6 @@ std::vector<paddle::Tensor> AppendAttention(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        encoder_block_shape_q,
-        decoder_block_shape_q,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         speculate_decoder);
@@ -749,10 +722,6 @@ std::vector<paddle::Tensor> AppendAttention(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        encoder_block_shape_q,
-        decoder_block_shape_q,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         speculate_decoder);
@@ -798,10 +767,6 @@ std::vector<paddle::Tensor> AppendAttention(
         quant_max_bound,
         quant_min_bound,
         out_linear_in_scale,
-        encoder_block_shape_q,
-        decoder_block_shape_q,
-        max_partition_size,
-        encoder_max_partition_size,
         speculate_max_draft_token_num,
         causal,
         speculate_decoder);
@@ -903,10 +868,6 @@ std::vector<paddle::DataType> AppendAttentionInferDtype(
     const float quant_max_bound,
     const float quant_min_bound,
     const float out_linear_in_scale,
-    const int encoder_block_shape_q,
-    const int decoder_block_shape_q,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool speculate_decoder) {
@@ -983,10 +944,6 @@ PD_BUILD_OP(append_attention)
             "quant_max_bound: float",
             "quant_min_bound: float",
             "out_linear_in_scale: float",
-            "encoder_block_shape_q: int",
-            "decoder_block_shape_q: int",
-            "max_partition_size: int",
-            "encoder_max_partition_size: int",
             "speculate_max_draft_token_num: int",
             "causal: bool",
             "speculate_decoder: bool"})
```

csrc/gpu/append_attn/append_attention_c16_impl.cuh
Lines changed: 4 additions & 10 deletions

```diff
@@ -786,8 +786,6 @@ void MultiQueryAppendAttention(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool is_decoder,
     cudaStream_t &stream,
@@ -839,9 +837,9 @@
   int sm_count;
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);

-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  uint32_t chunk_size = get_max_partition_size(bsz);
   if (!is_decoder) {
-    chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
+    chunk_size = max_seq_len;
   }
   const int num_chunks = div_up(max_dec_len, chunk_size);
   dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
@@ -1058,9 +1056,9 @@
   int sm_count;
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);

-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  uint32_t chunk_size = get_max_partition_size(bsz);
   if (!is_decoder) {
-    chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
+    chunk_size = max_seq_len;
   }
   const int num_chunks = div_up(max_dec_len, chunk_size);

@@ -1301,8 +1299,6 @@ void CascadeAppendAttentionC16Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool is_decoder,
@@ -1363,8 +1359,6 @@ void CascadeAppendAttentionC16Kernel(
       quant_max_bound,
       quant_min_bound,
       in_scale,
-      max_partition_size,
-      encoder_max_partition_size,
      speculate_max_draft_token_num,
      is_decoder,
      stream,
```
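To make the partitioning concrete: the KV cache is split into ceil(max_dec_len / chunk_size) chunks, each occupying one slice of the launch grid. A self-contained sketch of that arithmetic with illustrative numbers (div_up as conventionally defined; its definition is not reproduced in this diff):

```cpp
#include <cstdio>

// Ceiling division, as conventionally used for sizing kernel grids.
constexpr int div_up(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int max_dec_len = 1000;  // longest KV length this step (illustrative)
  const int chunk_size  = 128;   // partition size picked by the heuristic
  // 1000 / 128 rounds up to 8, so the grid gets 8 chunks along its y axis.
  std::printf("num_chunks = %d\n", div_up(max_dec_len, chunk_size));
  return 0;
}
```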

csrc/gpu/append_attn/append_attention_c4_impl.cuh
Lines changed: 4 additions & 10 deletions

```diff
@@ -973,8 +973,6 @@ void MultiQueryAppendC4Attention(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool is_decoder,
     cudaStream_t &stream,
@@ -1036,9 +1034,9 @@
   const float ratio = static_cast<float>(num_blocks_need) /
                       static_cast<float>(num_blocks_per_wave);

-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  uint32_t chunk_size = get_max_partition_size(bsz);
   if (!is_decoder) {
-    chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
+    chunk_size = max_seq_len;
   }
   const int num_chunks = div_up(max_dec_len, chunk_size);

@@ -1282,9 +1280,9 @@
                       static_cast<float>(num_blocks_per_wave);


-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  uint32_t chunk_size = get_max_partition_size(bsz);
   if (!is_decoder) {
-    chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
+    chunk_size = max_seq_len;
   }
   const int num_chunks = div_up(max_dec_len, chunk_size);
   dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
@@ -1538,8 +1536,6 @@ void CascadeAppendAttentionC4Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool is_decoder,
@@ -1604,8 +1600,6 @@ void CascadeAppendAttentionC4Kernel(
      quant_max_bound,
      quant_min_bound,
      in_scale,
-     max_partition_size,
-     encoder_max_partition_size,
      speculate_max_draft_token_num,
      is_decoder,
      stream,
```
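The launch shape dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads) assigns one thread block per (query tile, KV chunk, KV head) triple. A minimal CUDA skeleton of how such a kernel might recover those coordinates; the kernel name and body are illustrative only, not taken from this file:

```cpp
// Illustrative skeleton for a dim3(grid_x, num_chunks, kv_num_heads) launch.
__global__ void chunked_attention_sketch(int chunk_size) {
  const int qo_block_id = blockIdx.x;  // which tile of query tokens
  const int chunk_id    = blockIdx.y;  // which KV-cache partition
  const int kv_head_id  = blockIdx.z;  // which KV head
  // Each block attends its query tile to KV positions
  // [chunk_id * chunk_size, (chunk_id + 1) * chunk_size).
  (void)qo_block_id; (void)chunk_id; (void)kv_head_id; (void)chunk_size;
}
```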

csrc/gpu/append_attn/append_attention_c8_impl.cuh
Lines changed: 4 additions & 10 deletions

```diff
@@ -860,8 +860,6 @@ void MultiQueryAppendC8Attention(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool is_decoder,
     cudaStream_t &stream,
@@ -914,9 +912,9 @@
   const int dev_id = 0;
   int sm_count;
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  uint32_t chunk_size = get_max_partition_size(bsz);
   if (!is_decoder) {
-    chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
+    chunk_size = max_seq_len;
   }
   const int num_chunks = div_up(max_dec_len, chunk_size);
   dim3 grids(num_blocks_x_cpu, num_chunks, kv_num_heads);
@@ -1136,9 +1134,9 @@
   const int dev_id = 0;
   int sm_count;
   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
-  uint32_t chunk_size = static_cast<uint32_t>(max_partition_size);
+  uint32_t chunk_size = get_max_partition_size(bsz);
   if (!is_decoder) {
-    chunk_size = static_cast<uint32_t>(encoder_max_partition_size);
+    chunk_size = max_seq_len;
   }

   const int num_chunks = div_up(max_dec_len, chunk_size);
@@ -1377,8 +1375,6 @@ void CascadeAppendAttentionC8Kernel(
     const float quant_max_bound,
     const float quant_min_bound,
     const float in_scale,
-    const int max_partition_size,
-    const int encoder_max_partition_size,
     const int speculate_max_draft_token_num,
     const bool causal,
     const bool is_decoder,
@@ -1441,8 +1437,6 @@ void CascadeAppendAttentionC8Kernel(
      quant_max_bound,
      quant_min_bound,
      in_scale,
-     max_partition_size,
-     encoder_max_partition_size,
      speculate_max_draft_token_num,
      is_decoder,
      stream,
```
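Across the three KV-cache layouts (C16, C8, C4) the change is identical: during decode, the partition size now comes from get_max_partition_size(bsz) instead of the removed max_partition_size attribute, and during prefill the whole max_seq_len is treated as a single chunk. get_max_partition_size is not shown in this commit; below is a hypothetical batch-aware heuristic, purely to illustrate why such a helper might take the batch size as input (names, threshold, and sizes are assumptions):

```cpp
#include <cstdlib>

// Hypothetical batch-aware partition-size heuristic. Small decode batches
// under-fill the GPU, so smaller chunks expose more blocks to schedule;
// large batches already saturate the SMs, so bigger chunks cut the cost of
// the cross-chunk reduction. Threshold and sizes are illustrative only.
int get_max_partition_size(int bsz) {
  if (const char* v = std::getenv("MAX_PARTITION_SIZE")) {
    return std::atoi(v);  // assumed explicit override
  }
  return bsz <= 4 ? 128 : 512;
}
```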
