
Commit 9ba4e7e

Remove num_hidden_layers for baichuan (#10655)
* remove num_hidden_layers for baichuan
* add enable_linear_fused_grad_add
* remove fused_linear_param_grad_add
* add tensor_fusion
* add fused_linear_param_grad_add
1 parent: d67eaa4 · commit: 9ba4e7e

2 files changed: +7 lines, -5 lines

tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichuan-2-13b_pretrain_dynamic_auto_bs32_bf16_DP1_MP4_PP1_Sharding8_Stage1.sh

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,8 @@ param+="model_type=baichuan2_13b "
 param+='dynamic_auto=_dynamic_auto '
 
 export FLAGS_fuse_reducescatter_in_opt=1
+export FLAGS_enable_sharding_overlap=1
+export FLAGS_enable_tensor_fusion=1
 
 cd ./tests
 bash ./test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh
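
The two new exports sit next to the existing FLAGS_fuse_reducescatter_in_opt switch; judging by their names and the existing "enable_tensor_fusion enable_overlap" entries in sharding_parallel_config in the JSON below, they turn on tensor fusion and communication overlap for the sharded optimizer. Run by hand, the relevant tail of the benchmark script now amounts to the sketch below (the run_benchmark.sh step is an assumption about the usual TIPC layout, not part of this commit):

# Environment switches for the N4C32 baichuan2-13b run (semantics inferred from the flag names)
export FLAGS_fuse_reducescatter_in_opt=1
export FLAGS_enable_sharding_overlap=1   # new in this commit
export FLAGS_enable_tensor_fusion=1      # new in this commit

cd ./tests
bash ./test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh
# bash ./test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh ...  # assumed follow-up step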

tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b_dynamic_auto.json

Lines changed: 5 additions & 5 deletions
@@ -32,25 +32,25 @@
     "per_device_eval_batch_size": 1,
     "recompute": false,
     "recompute_use_reentrant": true,
-    "recompute_granularity": "full",
+    "recompute_granularity": "full_attn",
     "pp_recompute_interval": 0,
     "bf16": true,
     "fp16_opt_level": "O2",
     "amp_master_grad": true,
     "fuse_attention_ffn": true,
     "fuse_attention_qkv": true,
     "use_flash_attention": true,
-    "fused_linear": 1,
+    "fused_linear": true,
     "fused_linear_param_grad_add": 1,
+    "enable_linear_fused_grad_add": true,
     "use_fused_rope": true,
     "use_fused_rms_norm": true,
     "max_seq_length": 4096,
-    "sequence_parallel": 1,
+    "sequence_parallel": true,
     "sharding": "stage1",
     "sharding_parallel_degree": 8,
     "sharding_parallel_config": "enable_tensor_fusion enable_overlap",
     "tensor_parallel_config": "enable_mp_async_allreduce replace_with_parallel_cross_entropy",
     "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
-    "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward",
-    "num_hidden_layers": 20
+    "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
 }
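
Net effect of the JSON changes: the 0/1 pseudo-boolean values for fused_linear and sequence_parallel become proper booleans, enable_linear_fused_grad_add is switched on alongside the existing fused_linear_param_grad_add, recompute_granularity changes from "full" to "full_attn", and the "num_hidden_layers": 20 override is dropped, presumably so the benchmark runs the model's full depth. A minimal sketch of how a config like this is typically launched on each node (the run_pretrain_auto.py entrypoint and exact paths are assumptions about the usual PaddleNLP auto-parallel workflow, not part of this diff):

# One of the 4 nodes in the N4C32 (4 nodes x 8 GPUs) setup; paths and entrypoint are assumptions
export FLAGS_fuse_reducescatter_in_opt=1
export FLAGS_enable_sharding_overlap=1
export FLAGS_enable_tensor_fusion=1

python -u -m paddle.distributed.launch \
    --gpus "0,1,2,3,4,5,6,7" \
    run_pretrain_auto.py \
    ./pretrain_config_baichuan2_13b/pretrain-baichuan2_13b_dynamic_auto.json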
