Run llama2-13b custom op perf test

waliwali777 · waliwali777 · commit 5f74e510e603 · 2025-05-20T13:34:49.000+08:00
diff --git a/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json b/tests/test_tipc/static/auto_parallel/llama2/pretrain_config_llama2_13b/pretrain-llama2_13b.json
@@ -7,14 +7,14 @@
   "gradient_accumulation_steps": 4,
   "per_device_eval_batch_size": 4,
   "tensor_parallel_degree": 1,
-  "pipeline_parallel_degree": 4,
+  "pipeline_parallel_degree": 1,
   "sharding": "stage1",
   "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
   "sharding_parallel_config": "enable_overlap enable_tensor_fusion",
   "tensor_parallel_config": "enable_mp_async_allreduce",
   "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward",
-  "pipeline_schedule_mode": "VPP", 
-  "virtual_pp_degree": 5,
+  "pipeline_schedule_mode": "",
+  "virtual_pp_degree": 1,
   "sequence_parallel": 0,   
   "use_flash_attention": true,
   "use_fused_rms_norm": true,
@@ -51,6 +51,7 @@
   "recompute_granularity": "full",
   "save_total_limit": 2,
   "device": "gpu",
-  "to_static": true,
-  "enable_auto_parallel": true
+  "to_static": false,
+  "enable_auto_parallel": true,
+  "num_hidden_layers": 10
 }