ci/L0_backend_vllm/vllm_backend: 1 file changed, +4, -5 lines

@@ -114,23 +114,22 @@ if [[ "$COUNT" -ne 2 ]]; then
     echo "Cmdline parameters verification Failed"
 fi
 
-# Test loading multiple vllm models at the same time
+# Test loading multiple vllm models
 SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_one"
 SERVER_LOG="./vllm_test_multi_model.log"
 
 # Create two models, one is just a copy of the other, and make sure gpu
 # utilization is low enough for multiple models to avoid OOM.
 # vLLM changed behavior of their GPU profiler from total to free memory,
-# so to load two small models at the same time, we need to start
-# triton server in explicit mode, load first model with
-# `gpu_memory_utilization` 0.4 and second should be 0.9.
+# so to load two small models, we need to start
+# triton server in explicit mode.
 MODEL1="vllm_one"
 MODEL2="vllm_two"
 mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/${MODEL1}/
 cp -r models/${MODEL1} models/${MODEL2}
 sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL1}/1/model.json
-sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.9/' models/${MODEL2}/1/model.json
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL2}/1/model.json
 
 run_server
 if [ "$SERVER_PID" == "0" ]; then
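Note on the sed edits: after this change, both model copies request the same, lower share of GPU memory (0.4 each) instead of the old 0.4/0.9 split. A minimal sketch of checking the result; the model.json fields other than gpu_memory_utilization are assumptions based on the sample vllm_model config, not taken from this diff:

# Expected shape of models/vllm_one/1/model.json after the sed edit
# (the "model" and "disable_log_requests" fields are assumed from the
# sample repo and may differ):
#
#   {
#       "model": "facebook/opt-125m",
#       "disable_log_requests": true,
#       "gpu_memory_utilization": 0.4
#   }
#
# Quick check that the substitution landed in both copies:
grep '"gpu_memory_utilization": 0.4' \
    models/vllm_one/1/model.json \
    models/vllm_two/1/model.json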
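Note on explicit mode: with --model-control-mode=explicit, only vllm_one (named in --load-model) is loaded at startup, so the second copy has to be loaded after the server is up. A hedged sketch of doing that through Triton's model repository HTTP API, assuming the default HTTP port 8000; the test itself may load the model a different way:

# Load the second model through Triton's repository API.
curl -s -X POST localhost:8000/v2/repository/models/vllm_two/load

# Both copies should now be listed as READY in the repository index.
curl -s -X POST localhost:8000/v2/repository/index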