Skip to content

Commit 7c1c9ba

Browse files
authored
[LLM Benchmark]update scripts (#9722)
* add no_proxy & del paddlenlp_ops
* update timeout for dpo
* fix sequence_parallel
* add timeout
* add Total_Tokens_per_second_per_gpu
* fix Tokens_per_second_per_gpu
* update Total_Tokens_per_second_per_gpu
1 parent 730a762 commit 7c1c9ba

File tree

7 files changed

+30
-16
lines changed

7 files changed

+30
-16
lines changed

fix_time

Whitespace-only changes.

tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"max_seq_len": 4096,
1616
"max_prompt_len": 2048,
1717
"pipeline_parallel_config": "disable_partial_send_recv enable_clear_every_step_cache",
18-
"sequence_parallel": 1,
18+
"sequence_parallel": 0,
1919
"bf16": true,
2020
"fp16_opt_level": "O2",
2121
"do_train": true,

tests/test_tipc/llm/llama2/benchmark_common/prepare.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ python setup.py install
2424
cd -
2525

2626
# install paddlenlp_ops
27-
cd ../csrc/
28-
python setup_cuda.py install
29-
cd -
27+
# cd ../csrc/
28+
# python setup_cuda.py install
29+
# cd -
3030

3131
cd ../llm
3232
cp -r ../tests/test_tipc/llm/llama2/benchmark_common/benchmark_json ./

tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ function _set_params(){
3636
skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step
3737
keyword="Effective_Tokens_per_second_per_gpu:" # (必选)解析日志,筛选出性能数据所在行的关键字
3838
is_large_model=True # (可选)普通模型默认为False,如果添加大模型且只取一条ips设置为True
39-
convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
39+
convergence_key="Total_Tokens_per_second_per_gpu:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
4040

4141
fp_item="bf16"
4242
# 以下为通用执行命令,无特殊可不用修改
@@ -105,18 +105,25 @@ function _train(){
105105
;;
106106
esac
107107
cd ../llm/
108+
export no_proxy=bcebos.com
108109
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
109110
python -c "import paddlenlp"
110111
if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间
111112
${train_cmd} > ${log_file} 2>&1
112113
else
113-
timeout 30m ${train_cmd} > ${log_file} 2>&1
114+
timeout 60m ${train_cmd} > ${log_file} 2>&1
114115
# echo ${train_cmd}
115116
Effective_Tokens_per_second=`cat ${log_file} | grep -E 'Effective_Tokens_per_second|Effective tokens per second:' \
116117
|awk -F': ' '{print $2}' |awk -F' ' '{print $1}'`
117118
num_gpu=$(echo "$device_num" | sed 's/^.*C//')
118-
ips=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}')
119-
echo "Effective_Tokens_per_second_per_gpu: ${ips}" >> ${log_file}
119+
Effective_Tokens_per_second_per_gpu=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}')
120+
echo "Effective_Tokens_per_second_per_gpu: ${Effective_Tokens_per_second_per_gpu}" >> ${log_file}
121+
Train_samples_per_second=`cat ${log_file} | grep 'train_samples_per_second' \
122+
|awk -F'train_samples_per_second: ' '{print $2}' |awk -F', ' '{print $1}'`
123+
length=4096
124+
Total_Tokens_per_second=$(awk -v a="$Train_samples_per_second" -v b="$length" 'BEGIN {printf "%.2f\n", a * b}')
125+
Total_Tokens_per_second_per_gpu=$(awk -v a="$Total_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}')
126+
echo "Total_Tokens_per_second_per_gpu: ${Total_Tokens_per_second_per_gpu}" >> ${log_file}
120127
fi
121128
if [ $? -ne 0 ];then
122129
echo -e "${model_name}, FAIL"

tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"max_seq_len": 4096,
1616
"max_prompt_len": 2048,
1717
"pipeline_parallel_config": "disable_partial_send_recv enable_clear_every_step_cache",
18-
"sequence_parallel": 1,
18+
"sequence_parallel": 0,
1919
"bf16": true,
2020
"fp16_opt_level": "O2",
2121
"do_train": true,

tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ python setup.py install
2424
cd -
2525

2626
# install paddlenlp_ops
27-
cd ../csrc/
28-
python setup_cuda.py install
29-
cd -
27+
# cd ../csrc/
28+
# python setup_cuda.py install
29+
# cd -
3030

3131
cd ../llm
3232
cp -r ../tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json ./

tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ function _set_params(){
3636
skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step
3737
keyword="Effective_Tokens_per_second_per_gpu:" # (必选)解析日志,筛选出性能数据所在行的关键字
3838
is_large_model=True # (可选)普通模型默认为False,如果添加大模型且只取一条ips设置为True
39-
convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
39+
convergence_key="Total_Tokens_per_second_per_gpu:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:"
4040

4141
fp_item="bf16"
4242
# 以下为通用执行命令,无特殊可不用修改
@@ -105,18 +105,25 @@ function _train(){
105105
;;
106106
esac
107107
cd ../llm/
108+
export no_proxy=bcebos.com
108109
echo "train_cmd: ${train_cmd} log_file: ${log_file}"
109110
python -c "import paddlenlp"
110111
if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间
111112
${train_cmd} > ${log_file} 2>&1
112113
else
113-
timeout 30m ${train_cmd} > ${log_file} 2>&1
114+
timeout 60m ${train_cmd} > ${log_file} 2>&1
114115
# echo ${train_cmd}
115116
Effective_Tokens_per_second=`cat ${log_file} | grep -E 'Effective_Tokens_per_second|Effective tokens per second:' \
116117
|awk -F': ' '{print $2}' |awk -F' ' '{print $1}'`
117118
num_gpu=$(echo "$device_num" | sed 's/^.*C//')
118-
ips=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}')
119-
echo "Effective_Tokens_per_second_per_gpu: ${ips}" >> ${log_file}
119+
Effective_Tokens_per_second_per_gpu=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}')
120+
echo "Effective_Tokens_per_second_per_gpu: ${Effective_Tokens_per_second_per_gpu}" >> ${log_file}
121+
Train_samples_per_second=`cat ${log_file} | grep 'train_samples_per_second' \
122+
|awk -F'train_samples_per_second: ' '{print $2}' |awk -F', ' '{print $1}'`
123+
length=4096
124+
Total_Tokens_per_second=$(awk -v a="$Train_samples_per_second" -v b="$length" 'BEGIN {printf "%.2f\n", a * b}')
125+
Total_Tokens_per_second_per_gpu=$(awk -v a="$Total_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}')
126+
echo "Total_Tokens_per_second_per_gpu: ${Total_Tokens_per_second_per_gpu}" >> ${log_file}
120127
fi
121128
if [ $? -ne 0 ];then
122129
echo -e "${model_name}, FAIL"

0 commit comments

Comments (0)