Skip to content

Commit 52c1c3c

Browse files
authored
Piggyback final flag as a part of final response (#28)
1 parent 6e084cd commit 52c1c3c

File tree

1 file changed

+14
-4
lines changed

1 file changed

+14
-4
lines changed

src/model.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -264,12 +264,21 @@ async def generate(self, request):
264264
self.logger.log_info("[vllm] Successfully cancelled the request")
265265
break
266266
if stream:
267-
response_sender.send(self.create_response(output))
267+
if output.finished:
268+
response_sender.send(
269+
self.create_response(output),
270+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
271+
)
272+
else:
273+
response_sender.send(self.create_response(output))
268274
else:
269275
last_output = output
270276

271277
if not stream:
272-
response_sender.send(self.create_response(last_output))
278+
response_sender.send(
279+
self.create_response(last_output),
280+
flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
281+
)
273282

274283
except Exception as e:
275284
self.logger.log_info(f"[vllm] Error generating stream: {e}")
@@ -280,10 +289,11 @@ async def generate(self, request):
280289
response = pb_utils.InferenceResponse(
281290
output_tensors=[triton_output_tensor], error=error
282291
)
283-
response_sender.send(response)
292+
response_sender.send(
293+
response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
294+
)
284295
raise e
285296
finally:
286-
response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
287297
self.ongoing_request_count -= 1
288298

289299
def execute(self, requests):

0 commit comments

Comments
 (0)