
Commit 9cb617c

add external dp launcher

Signed-off-by: whx-sjtu <2952154980@qq.com>

1 parent 0df059f

2 files changed: 143 additions, 0 deletions
Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
import argparse
import multiprocessing
import os
import subprocess
import sys


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dp-size",
        type=int,
        required=True,
        help="Data parallel size."
    )
    parser.add_argument(
        "--tp-size",
        type=int,
        default=1,
        help="Tensor parallel size."
    )
    parser.add_argument(
        "--dp-size-local",
        type=int,
        required=True,
        help="Local data parallel size."
    )
    parser.add_argument(
        "--dp-rank-start",
        type=int,
        default=0,
        help="Starting rank for data parallel."
    )
    parser.add_argument(
        "--dp-address",
        type=str,
        required=True,
        help="IP address of the data parallel master node."
    )
    parser.add_argument(
        "--dp-rpc-port",
        type=str,
        required=True,
        help="Port of the data parallel master node."
    )
    parser.add_argument(
        "--vllm-start-port",
        type=int,
        default=9000,
        help="Starting port for the engine."
    )
    return parser.parse_args()


args = parse_args()
dp_size = args.dp_size
tp_size = args.tp_size
dp_size_local = args.dp_size_local
dp_rank_start = args.dp_rank_start
dp_address = args.dp_address
dp_rpc_port = args.dp_rpc_port
vllm_start_port = args.vllm_start_port


def run_command(visible_devices, dp_rank, vllm_engine_port):
    # Launch one vLLM engine via the shell template, pinned to the given
    # devices, engine port, and data parallel rank.
    command = [
        "bash",
        "./run_dp_template.sh",
        visible_devices,
        str(vllm_engine_port),
        str(dp_size),
        str(dp_rank),
        dp_address,
        dp_rpc_port,
        str(tp_size),
    ]
    subprocess.run(command, check=True)


if __name__ == "__main__":
    template_path = "./run_dp_template.sh"
    if not os.path.exists(template_path):
        print(f"Template file {template_path} does not exist.")
        sys.exit(1)

    processes = []
    num_cards = dp_size_local * tp_size
    for i in range(dp_size_local):
        dp_rank = dp_rank_start + i
        vllm_engine_port = vllm_start_port + i
        visible_devices = ",".join(str(x) for x in range(i * tp_size, (i + 1) * tp_size))
        process = multiprocessing.Process(target=run_command,
                                          args=(visible_devices, dp_rank,
                                                vllm_engine_port))
        processes.append(process)
        process.start()

    for process in processes:
        process.join()
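
For reference, a minimal sketch of how this launcher could be invoked for the second node of a dp-size 4 deployment. The launcher filename launch_dp.py is assumed for illustration, since the diff does not show the file path; the address and ports are example values only:

    # Assumed filename for the launcher script added in this commit.
    python launch_dp.py \
        --dp-size 4 \
        --tp-size 4 \
        --dp-size-local 2 \
        --dp-rank-start 2 \
        --dp-address 192.0.2.10 \
        --dp-rpc-port 13389 \
        --vllm-start-port 9000

With these values the launcher spawns two engine processes on this node: DP rank 2 on devices 0-3 at port 9000, and DP rank 3 on devices 4-7 at port 9001.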
Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
# Positional arguments passed in by the launcher:
#   $1 visible devices, $2 engine port, $3 dp size, $4 dp rank,
#   $5 dp master address, $6 dp rpc port, $7 tp size
export HCCL_IF_IP=your_ip_here
export GLOO_SOCKET_IFNAME=your_socket_ifname_here
export TP_SOCKET_IFNAME=your_socket_ifname_here
export HCCL_SOCKET_IFNAME=your_socket_ifname_here
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=your_rank_table_path_here
export VLLM_LOGGING_LEVEL="info"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_DETERMINISTIC=True
export HCCL_BUFFER_SIZE=1024
export TASK_QUEUE_ENABLE=1
# Spawning the process inside vLLM may cause a circular import issue, so using fork here is necessary.
export VLLM_WORKER_MULTIPROC_METHOD="fork"

export VLLM_USE_V1=1

export ASCEND_RT_VISIBLE_DEVICES=$1

vllm serve model_path \
    --host 0.0.0.0 \
    --port $2 \
    --data-parallel-size $3 \
    --data-parallel-rank $4 \
    --data-parallel-address $5 \
    --data-parallel-rpc-port $6 \
    --tensor-parallel-size $7 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name dsv3 \
    --max-model-len 3500 \
    --max-num-batched-tokens 3500 \
    --max-num-seqs 28 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --kv-transfer-config \
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu",
    "kv_role": "kv_consumer",
    "kv_parallel_size": "1",
    "kv_port": "20001",
    "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }' \
    --additional-config \
    '{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "enable_multistream_moe":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true}'
