
Commit 657954c

Updated vllm version to v0.4.0.post1 (#38)
1 parent c1c88fa commit 657954c

File tree

  • ci/L0_backend_vllm/vllm_backend

1 file changed: +4 −5 lines changed

ci/L0_backend_vllm/vllm_backend/test.sh

Lines changed: 4 additions & 5 deletions
@@ -114,23 +114,22 @@ if [[ "$COUNT" -ne 2 ]]; then
     echo "Cmdline parameters verification Failed"
 fi
 
-# Test loading multiple vllm models at the same time
+# Test loading multiple vllm models
 SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_one"
 SERVER_LOG="./vllm_test_multi_model.log"
 
 # Create two models, one is just a copy of the other, and make sure gpu
 # utilization is low enough for multiple models to avoid OOM.
 # vLLM changed behavior of their GPU profiler from total to free memory,
-# so to load two small models at the same time, we need to start
-# triton server in explicit mode, load first model with
-# `gpu_memory_utilization` 0.4 and second should be 0.9.
+# so to load two small models, we need to start
+# triton server in explicit mode.
 MODEL1="vllm_one"
 MODEL2="vllm_two"
 mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/${MODEL1}/
 cp -r models/${MODEL1} models/${MODEL2}
 sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL1}/1/model.json
-sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.9/' models/${MODEL2}/1/model.json
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL2}/1/model.json
 
 run_server
 if [ "$SERVER_PID" == "0" ]; then
