Skip to content

Commit 5e9b09f

Browse files
committed
Revert "Return token ids instead of number of token ids"
This reverts commit 457eeaa.
1 parent 457eeaa commit 5e9b09f

File tree

3 files changed

+35
-30
lines changed

3 files changed

+35
-30
lines changed

ci/L0_additional_outputs_vllm/additional_outputs_test.py

Lines changed: 16 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ def _get_inputs(
4444
sampling_parameters=None,
4545
return_finish_reason=None,
4646
return_cumulative_logprob=None,
47-
return_token_ids=None,
47+
return_num_token_ids=None,
4848
):
4949
inputs = []
5050

@@ -76,9 +76,9 @@ def _get_inputs(
7676
np.array([return_cumulative_logprob], dtype=bool)
7777
)
7878

79-
if return_token_ids is not None:
80-
inputs.append(grpcclient.InferInput("return_token_ids", [1], "BOOL"))
81-
inputs[-1].set_data_from_numpy(np.array([return_token_ids], dtype=bool))
79+
if return_num_token_ids is not None:
80+
inputs.append(grpcclient.InferInput("return_num_token_ids", [1], "BOOL"))
81+
inputs[-1].set_data_from_numpy(np.array([return_num_token_ids], dtype=bool))
8282

8383
return inputs
8484

@@ -131,15 +131,15 @@ def _assert_cumulative_logprob(self, return_cumulative_logprob):
131131
assert cumulative_logprob != prev_cumulative_logprob
132132
prev_cumulative_logprob = cumulative_logprob
133133

134-
def _assert_token_ids(self, return_token_ids):
134+
def _assert_num_token_ids(self, return_num_token_ids):
135135
for response in self._responses:
136136
result, error = response["result"], response["error"]
137137
assert error is None
138-
token_ids_np = result.as_numpy(name="token_ids")
139-
if return_token_ids is None or return_token_ids == False:
140-
assert token_ids_np is None
138+
num_token_ids_np = result.as_numpy(name="num_token_ids")
139+
if return_num_token_ids is None or return_num_token_ids == False:
140+
assert num_token_ids_np is None
141141
continue
142-
token_ids = token_ids_np[0].astype(int)
142+
num_token_ids = num_token_ids_np[0].astype(int)
143143
# TODO: vLLM may return token ids identical to the previous one when
144144
# streaming, for example:
145145
#
@@ -155,31 +155,31 @@ def _assert_token_ids(self, return_token_ids):
155155
# prev: text=' the term', token_ids=array('l', [5, 1385, 44, 48])
156156
# curr: text=' the term “', token_ids=array('l', [5, 1385, 44, 48])
157157
#
158-
# If this is no longer the case in a future release, change to
159-
# assert len(token_ids) > 0.
160-
assert len(token_ids) >= 0
158+
# If this is no longer the case in a future release, change the assert
159+
# to assert num_token_ids > 0.
160+
assert num_token_ids >= 0
161161

162162
@pytest.mark.parametrize("stream", [True, False])
163163
@pytest.mark.parametrize("return_finish_reason", [None, True, False])
164164
@pytest.mark.parametrize("return_cumulative_logprob", [None, True, False])
165-
@pytest.mark.parametrize("return_token_ids", [None, True, False])
165+
@pytest.mark.parametrize("return_num_token_ids", [None, True, False])
166166
def test_additional_outputs(
167167
self,
168168
stream,
169169
return_finish_reason,
170170
return_cumulative_logprob,
171-
return_token_ids,
171+
return_num_token_ids,
172172
):
173173
inputs = self._get_inputs(
174174
self._prompt,
175175
stream=stream,
176176
sampling_parameters=self._sampling_parameters,
177177
return_finish_reason=return_finish_reason,
178178
return_cumulative_logprob=return_cumulative_logprob,
179-
return_token_ids=return_token_ids,
179+
return_num_token_ids=return_num_token_ids,
180180
)
181181
self._llm_infer(inputs)
182182
self._assert_text_output_valid()
183183
self._assert_finish_reason(return_finish_reason)
184184
self._assert_cumulative_logprob(return_cumulative_logprob)
185-
self._assert_token_ids(return_token_ids)
185+
self._assert_num_token_ids(return_num_token_ids)

docs/additional_outputs.md

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -59,14 +59,17 @@ point value will be sent on the `cumulative_logprob` output tensor.
5959

6060
Supported since r24.11.
6161

62-
### Token IDs
62+
### Number of token IDs
6363

64-
The token IDs of the generated output text sent on this response. See
64+
The number of token IDs of the generated output text sent on this response. It
65+
is the difference in length of the token IDs generated from the last response to
66+
this response. If this is the first response, the last response length is
67+
presumed to be zero. See
6568
[here](https://github.com/vllm-project/vllm/blob/v0.6.3.post1/vllm/outputs.py#L21)
66-
for more details.
69+
for more details on the token IDs of the generated output text.
6770

68-
To enable, set `return_token_ids` input tensor to `True`. The array of integer
69-
value will be sent on the `token_ids` output tensor.
71+
To enable, set `return_num_token_ids` input tensor to `True`. The unsigned
72+
integer value will be sent on the `num_token_ids` output tensor.
7073

7174
Supported since r24.11.
7275

src/model.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
101101
"optional": True,
102102
},
103103
{
104-
"name": "return_token_ids",
104+
"name": "return_num_token_ids",
105105
"data_type": "TYPE_BOOL",
106106
"dims": [1],
107107
"optional": True,
@@ -111,7 +111,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
111111
{"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]},
112112
{"name": "finish_reason", "data_type": "TYPE_STRING", "dims": [-1]},
113113
{"name": "cumulative_logprob", "data_type": "TYPE_FP32", "dims": [-1]},
114-
{"name": "token_ids", "data_type": "TYPE_INT64", "dims": [-1, -1]},
114+
{"name": "num_token_ids", "data_type": "TYPE_UINT32", "dims": [-1]},
115115
]
116116

117117
# Collect input and output names from the provided model config.
@@ -348,11 +348,11 @@ def _get_input_tensors(self, request):
348348
else:
349349
parameters = request.parameters()
350350

351-
# return_finish_reason, return_cumulative_logprob, return_token_ids
351+
# return_finish_reason, return_cumulative_logprob, return_num_token_ids
352352
additional_outputs = {
353353
"return_finish_reason": None,
354354
"return_cumulative_logprob": None,
355-
"return_token_ids": None,
355+
"return_num_token_ids": None,
356356
}
357357
for tensor_name in additional_outputs.keys():
358358
tensor = pb_utils.get_input_tensor_by_name(request, tensor_name)
@@ -467,8 +467,8 @@ def _create_response(
467467
)
468468
)
469469

470-
# token_ids
471-
if additional_outputs["return_token_ids"]:
470+
# num_token_ids
471+
if additional_outputs["return_num_token_ids"]:
472472
if prev_request_output is None:
473473
# this is the first response
474474
prev_lens = [0] * len(request_output.outputs)
@@ -478,12 +478,14 @@ def _create_response(
478478
len(prev_output.token_ids)
479479
for prev_output in prev_request_output.outputs
480480
]
481-
token_ids = [
482-
output.token_ids[prev_len:]
481+
num_token_ids = [
482+
(len(output.token_ids) - prev_len)
483483
for output, prev_len in zip(request_output.outputs, prev_lens)
484484
]
485485
output_tensors.append(
486-
pb_utils.Tensor("token_ids", np.asarray(token_ids, dtype=np.int64))
486+
pb_utils.Tensor(
487+
"num_token_ids", np.asarray(num_token_ids, dtype=np.uint32)
488+
)
487489
)
488490

489491
return pb_utils.InferenceResponse(output_tensors=output_tensors)

0 commit comments

Comments (0)