|
7 | 7 | "gradient_accumulation_steps": 4, |
8 | 8 | "per_device_eval_batch_size": 4, |
9 | 9 | "tensor_parallel_degree": 1, |
10 | | - "pipeline_parallel_degree": 4, |
| 10 | + "pipeline_parallel_degree": 1, |
11 | 11 | "sharding": "stage1", |
12 | 12 | "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", |
13 | 13 | "sharding_parallel_config": "enable_overlap enable_tensor_fusion", |
14 | 14 | "tensor_parallel_config": "enable_mp_async_allreduce", |
15 | 15 | "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward", |
16 | | - "pipeline_schedule_mode": "VPP", |
17 | | - "virtual_pp_degree": 5, |
| 16 | + "pipeline_schedule_mode": "", |
| 17 | + "virtual_pp_degree": 1, |
18 | 18 | "sequence_parallel": 0, |
19 | 19 | "use_flash_attention": true, |
20 | 20 | "use_fused_rms_norm": true, |
|
51 | 51 | "recompute_granularity": "full", |
52 | 52 | "save_total_limit": 2, |
53 | 53 | "device": "gpu", |
54 | | - "to_static": true, |
55 | | - "enable_auto_parallel": true |
| 54 | + "to_static": false, |
| 55 | + "enable_auto_parallel": true, |
| 56 | + "num_hidden_layers": 10 |
56 | 57 | } |
0 commit comments