
Commit 677847b

rm VLLM_ALLOW_LONG_MAX_MODEL_LEN
1 parent 66f6fbd commit 677847b
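
In short: the VLLM_ALLOW_LONG_MAX_MODEL_LEN escape hatch is removed. A requested context window longer than the value derived from the model's config.json now fails validation, and the supported route is to raise the model's declared limit explicitly, as the updated Qwen 1M example below does. A rough sketch of that replacement pattern (not part of the diff; values mirror the example):

from vllm import LLM

# Sketch of the replacement pattern, not part of this commit: instead of
# exporting VLLM_ALLOW_LONG_MAX_MODEL_LEN=1, raise the model's declared
# position limit via hf_overrides so the long max_model_len passes validation.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct-1M",
    max_model_len=1048576,
    hf_overrides={"max_position_embeddings": 1048576},
)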

6 files changed (+6, -30 lines)

.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json

Lines changed: 0 additions & 2 deletions

@@ -2,7 +2,6 @@
   {
     "test_name": "latency_llama8B_tp1",
     "environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
     "parameters": {
@@ -16,7 +15,6 @@
   {
     "test_name": "latency_llama8B_tp4",
     "environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
     "parameters": {

.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json

Lines changed: 0 additions & 5 deletions

@@ -4,7 +4,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
@@ -34,7 +33,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
@@ -64,7 +62,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
@@ -94,7 +91,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
@@ -127,7 +123,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },

.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json

Lines changed: 0 additions & 2 deletions

@@ -2,7 +2,6 @@
   {
     "test_name": "throughput_llama8B_tp1",
     "environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
     "parameters": {
@@ -17,7 +16,6 @@
   {
     "test_name": "throughput_llama8B_tp4",
     "environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
     "parameters": {

examples/offline_inference/qwen_1m.py

Lines changed: 3 additions & 3 deletions

@@ -6,7 +6,6 @@
 from vllm import LLM, SamplingParams
 
 os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
-os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
 
 
 def load_prompt() -> str:
@@ -49,14 +48,15 @@ def process_requests(llm: LLM, prompts: list[str]) -> None:
 
 
 # Create an LLM.
-def initialize_engine() -> LLM:
+def initialize_engine(max_model_len=1048576) -> LLM:
     llm = LLM(
         model="Qwen/Qwen2.5-7B-Instruct-1M",
-        max_model_len=1048576,
+        max_model_len=max_model_len,
         tensor_parallel_size=4,
         enforce_eager=True,
         enable_chunked_prefill=True,
         max_num_batched_tokens=131072,
+        hf_overrides={"max_position_embeddings": max_model_len}
     )
     return llm
 
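For context, a minimal sketch of how the updated helper would be driven, mirroring the example's own flow (illustrative, not part of the diff):

# Illustrative usage: the default 1048576 now feeds both max_model_len and the
# max_position_embeddings override inside initialize_engine().
llm = initialize_engine()
process_requests(llm, [load_prompt()])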
vllm/config.py

Lines changed: 3 additions & 8 deletions

@@ -3630,14 +3630,9 @@ def _get_and_verify_max_len(
                 f"{derived_max_model_len} or model_max_length="
                 f"{model_max_length} in model's config.json). This may lead "
                 "to incorrect model outputs or CUDA errors.")
-            if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
-                logger.warning(
-                    "%s Make sure the value is correct and within the "
-                    "model context size.", msg)
-            else:
-                raise ValueError(
-                    f"{msg} To allow overriding this maximum, set "
-                    "the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1")
+            raise ValueError(
+                f"{msg} Make sure the value is correct and within the "
+                "model context size.")
     return int(max_model_len)
 
 
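With the override path gone, _get_and_verify_max_len rejects an oversized max_model_len unconditionally. A minimal sketch of the resulting behavior; the model id and length below are placeholders, not taken from this commit:

from vllm import LLM

# Placeholder illustration: a user-specified max_model_len larger than the
# value derived from the model's config.json now raises ValueError during
# engine construction; setting VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 no longer
# bypasses the check.
try:
    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", max_model_len=1_000_000)
except ValueError as exc:
    print(f"rejected: {exc}")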
vllm/envs.py

Lines changed: 0 additions & 10 deletions

@@ -72,7 +72,6 @@
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
     CMAKE_BUILD_TYPE: Optional[str] = None
     VERBOSE: bool = False
-    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
     VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
     VLLM_PLUGINS: Optional[list[str]] = None
@@ -564,15 +563,6 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
     lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
 
-    # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
-    # the user to specify a max sequence length greater than
-    # the max length derived from the model's config.json.
-    # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
-    "VLLM_ALLOW_LONG_MAX_MODEL_LEN":
-    lambda:
-    (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
-     ("1", "true")),
-
     # If set, forces FP8 Marlin to be used for FP8 quantization regardless
     # of the hardware support for FP8 compute.
     "VLLM_TEST_FORCE_FP8_MARLIN":
