
Commit 677847b

rm VLLM_ALLOW_LONG_MAX_MODEL_LEN
1 parent 66f6fbd commit 677847b
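
In short: the VLLM_ALLOW_LONG_MAX_MODEL_LEN escape hatch is removed. A requested context window longer than the value derived from the model's config.json now fails validation, and the supported route is to raise the model's declared limit explicitly, as the updated Qwen 1M example below does. A rough sketch of that replacement pattern (not part of the diff; values mirror the example):

from vllm import LLM

# Sketch of the replacement pattern, not part of this commit: instead of
# exporting VLLM_ALLOW_LONG_MAX_MODEL_LEN=1, raise the model's declared
# position limit via hf_overrides so the long max_model_len passes validation.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct-1M",
    max_model_len=1048576,
    hf_overrides={"max_position_embeddings": 1048576},
)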

6 files changed (+6, -30 lines)

.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json

Lines changed: 0 additions & 2 deletions

@@ -2,7 +2,6 @@
   {
     "test_name": "latency_llama8B_tp1",
     "environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
     "parameters": {
@@ -16,7 +15,6 @@
   {
     "test_name": "latency_llama8B_tp4",
     "environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
     "parameters": {

.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json

Lines changed: 0 additions & 5 deletions

@@ -4,7 +4,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
@@ -34,7 +33,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
@@ -64,7 +62,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
@@ -94,7 +91,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
@@ -127,7 +123,6 @@
     "qps_list": [1, 4, 16, "inf"],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },

.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json

Lines changed: 0 additions & 2 deletions

@@ -2,7 +2,6 @@
   {
     "test_name": "throughput_llama8B_tp1",
     "environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
     "parameters": {
@@ -17,7 +16,6 @@
   {
     "test_name": "throughput_llama8B_tp4",
     "environment_variables": {
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
       "VLLM_CPU_KVCACHE_SPACE": 40
     },
     "parameters": {

examples/offline_inference/qwen_1m.py

Lines changed: 3 additions & 3 deletions

@@ -6,7 +6,6 @@
 from vllm import LLM, SamplingParams
 
 os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
-os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
 
 
 def load_prompt() -> str:
@@ -49,14 +48,15 @@ def process_requests(llm: LLM, prompts: list[str]) -> None:
 
 
 # Create an LLM.
-def initialize_engine() -> LLM:
+def initialize_engine(max_model_len=1048576) -> LLM:
     llm = LLM(
         model="Qwen/Qwen2.5-7B-Instruct-1M",
-        max_model_len=1048576,
+        max_model_len=max_model_len,
         tensor_parallel_size=4,
         enforce_eager=True,
         enable_chunked_prefill=True,
         max_num_batched_tokens=131072,
+        hf_overrides={"max_position_embeddings": max_model_len}
     )
     return llm
 
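For context, a minimal sketch of how the updated helper would be driven, mirroring the example's own flow (illustrative, not part of the diff):

# Illustrative usage: the default 1048576 now feeds both max_model_len and the
# max_position_embeddings override inside initialize_engine().
llm = initialize_engine()
process_requests(llm, [load_prompt()])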
vllm/config.py

Lines changed: 3 additions & 8 deletions

@@ -3630,14 +3630,9 @@ def _get_and_verify_max_len(
                 f"{derived_max_model_len} or model_max_length="
                 f"{model_max_length} in model's config.json). This may lead "
                 "to incorrect model outputs or CUDA errors.")
-            if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
-                logger.warning(
-                    "%s Make sure the value is correct and within the "
-                    "model context size.", msg)
-            else:
-                raise ValueError(
-                    f"{msg} To allow overriding this maximum, set "
-                    "the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1")
+            raise ValueError(
+                f"{msg} Make sure the value is correct and within the "
+                "model context size.")
     return int(max_model_len)
 
 
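With the override path gone, _get_and_verify_max_len rejects an oversized max_model_len unconditionally. A minimal sketch of the resulting behavior; the model id and length below are placeholders, not taken from this commit:

from vllm import LLM

# Placeholder illustration: a user-specified max_model_len larger than the
# value derived from the model's config.json now raises ValueError during
# engine construction; setting VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 no longer
# bypasses the check.
try:
    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", max_model_len=1_000_000)
except ValueError as exc:
    print(f"rejected: {exc}")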
vllm/envs.py

Lines changed: 0 additions & 10 deletions

@@ -72,7 +72,6 @@
     VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
     CMAKE_BUILD_TYPE: Optional[str] = None
     VERBOSE: bool = False
-    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
     VLLM_RPC_TIMEOUT: int = 10000  # ms
     VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5  # seconds
     VLLM_PLUGINS: Optional[list[str]] = None
@@ -564,15 +563,6 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH":
     lambda: bool(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)),
 
-    # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
-    # the user to specify a max sequence length greater than
-    # the max length derived from the model's config.json.
-    # To enable this, set VLLM_ALLOW_LONG_MAX_MODEL_LEN=1.
-    "VLLM_ALLOW_LONG_MAX_MODEL_LEN":
-    lambda:
-    (os.environ.get("VLLM_ALLOW_LONG_MAX_MODEL_LEN", "0").strip().lower() in
-     ("1", "true")),
-
     # If set, forces FP8 Marlin to be used for FP8 quantization regardless
     # of the hardware support for FP8 compute.
     "VLLM_TEST_FORCE_FP8_MARLIN":
