@@ -174,6 +174,9 @@ class ScriptArguments:
        enable_prefix_caching (`bool` or `None`, *optional*, defaults to `None`):
            Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the hardware support
            this feature.
+        enforce_eager (`bool` or `None`, *optional*, defaults to `None`):
+            Whether to enforce eager execution. If set to `True`, we will disable CUDA graphs and always execute the
+            model in eager mode. If `False` (default behavior), we will use CUDA graphs and eager execution in hybrid mode.
    """

    model: str = field(metadata={"help": "Model name or path to load the model from."})
@@ -224,6 +227,14 @@ class ScriptArguments:
            "hardware support this feature."
        },
    )
+    enforce_eager: Optional[bool] = field(
+        default=None,
+        metadata={
+            "help": "Whether to enforce eager execution. If set to `True`, we will disable CUDA graphs and always "
+            "execute the model in eager mode. If `False` (default behavior), we will use CUDA graphs and eager "
+            "execution in hybrid mode."
+        },
+    )


def main(script_args: ScriptArguments):
@@ -250,6 +261,7 @@ def main(script_args: ScriptArguments):
        revision=script_args.revision,
        tensor_parallel_size=script_args.tensor_parallel_size,
        gpu_memory_utilization=script_args.gpu_memory_utilization,
+        enforce_eager=script_args.enforce_eager,
        dtype=script_args.dtype,
        # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can
        # directly reuse the KV cache if it shares the same prefix with one of the existing queries.
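
For reference, here is a minimal sketch of what the new option controls on the vLLM side, mirroring the patched `main()` above (assumes vLLM is installed; the model name, prompt, and sampling settings are illustrative):

```python
# Minimal sketch: build a vLLM engine with enforce_eager set, as the patched
# main() now does via script_args.enforce_eager.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # illustrative model choice
    enforce_eager=True,  # True: skip CUDA graph capture and always run the model eagerly
)
outputs = llm.generate(["Hello, world!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```

Setting `enforce_eager=True` avoids the startup cost and extra memory of CUDA graph capture, at some potential throughput cost; the script leaves the field at `None` so that vLLM's own default behavior applies unless the user opts in.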