1 file changed: +14 -4 lines changed
Original file line number | Diff line number | Diff line change
@@ -264,12 +264,21 @@ async def generate(self, request):
264
264
self .logger .log_info ("[vllm] Successfully cancelled the request" )
265
265
break
266
266
if stream :
267
- response_sender .send (self .create_response (output ))
267
+ if output .finished :
268
+ response_sender .send (
269
+ self .create_response (output ),
270
+ flags = pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL ,
271
+ )
272
+ else :
273
+ response_sender .send (self .create_response (output ))
268
274
else :
269
275
last_output = output
270
276
271
277
if not stream :
272
- response_sender .send (self .create_response (last_output ))
278
+ response_sender .send (
279
+ self .create_response (last_output ),
280
+ flags = pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL ,
281
+ )
273
282
274
283
except Exception as e :
275
284
self .logger .log_info (f"[vllm] Error generating stream: { e } " )
@@ -280,10 +289,11 @@ async def generate(self, request):
280
289
response = pb_utils .InferenceResponse (
281
290
output_tensors = [triton_output_tensor ], error = error
282
291
)
283
- response_sender .send (response )
292
+ response_sender .send (
293
+ response , flags = pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL
294
+ )
284
295
raise e
285
296
finally :
286
- response_sender .send (flags = pb_utils .TRITONSERVER_RESPONSE_COMPLETE_FINAL )
287
297
self .ongoing_request_count -= 1
288
298
289
299
def execute (self , requests ):
You can’t perform that action at this time.
0 commit comments