|
7 | 7 | "gradient_accumulation_steps": 4,
|
8 | 8 | "per_device_eval_batch_size": 4,
|
9 | 9 | "tensor_parallel_degree": 1,
|
10 |
| - "pipeline_parallel_degree": 4, |
| 10 | + "pipeline_parallel_degree": 1, |
11 | 11 | "sharding": "stage1",
|
12 | 12 | "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
|
13 | 13 | "sharding_parallel_config": "enable_overlap enable_tensor_fusion",
|
14 | 14 | "tensor_parallel_config": "enable_mp_async_allreduce",
|
15 | 15 | "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward",
|
16 |
| - "pipeline_schedule_mode": "VPP", |
17 |
| - "virtual_pp_degree": 5, |
| 16 | + "pipeline_schedule_mode": "", |
| 17 | + "virtual_pp_degree": 1, |
18 | 18 | "sequence_parallel": 0,
|
19 | 19 | "use_flash_attention": true,
|
20 | 20 | "use_fused_rms_norm": true,
|
|
51 | 51 | "recompute_granularity": "full",
|
52 | 52 | "save_total_limit": 2,
|
53 | 53 | "device": "gpu",
|
54 |
| - "to_static": true, |
55 |
| - "enable_auto_parallel": true |
| 54 | + "to_static": false, |
| 55 | + "enable_auto_parallel": true, |
| 56 | + "num_hidden_layers": 10 |
56 | 57 | }
|
0 commit comments