
Commit b169d5f

[Misc][Tools][Benchmark] Add benchmark_serving support for llama.cpp. (vllm-project#18692)
Signed-off-by: Duyi-Wang <duyi.wang@intel.com>
1 parent f8977c2 commit b169d5f
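
With this change, benchmark_serving.py can be pointed at a llama.cpp server through its OpenAI-compatible completions endpoint. A hedged sketch of one possible invocation follows; the flag names mirror benchmark_serving.py's argparse options, while the server address, endpoint path, model name, and prompt count are placeholder assumptions rather than values taken from this commit.

import subprocess

# Run the serving benchmark against an already-running llama.cpp server.
# All concrete values below (port 8080, model name, prompt count) are
# illustrative assumptions, not part of the commit.
subprocess.run(
    [
        "python", "benchmarks/benchmark_serving.py",
        "--backend", "llama.cpp",                # routed to async_request_openai_completions
        "--base-url", "http://localhost:8080",   # assumed llama.cpp server address
        "--endpoint", "/v1/completions",         # OpenAI-compatible completions route
        "--model", "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model/tokenizer name
        "--dataset-name", "random",
        "--num-prompts", "100",
    ],
    check=True,
)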

File tree

2 files changed: 6 additions, 1 deletion

benchmarks/backend_request_func.py

Lines changed: 2 additions & 1 deletion
@@ -324,7 +324,7 @@ async def async_request_openai_completions(
 
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
-                            elif usage := data.get("usage"):
+                            if usage := data.get("usage"):
                                 output.output_tokens = usage.get("completion_tokens")
                     if first_chunk_received:
                         output.success = True
@@ -611,6 +611,7 @@ def get_tokenizer(
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
+    "llama.cpp": async_request_openai_completions,
 }
 
 OPENAI_COMPATIBLE_BACKENDS = [
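
The first hunk loosens the stream-parsing logic: with "elif", the "usage" field was only read from chunks that carried no "choices", so a final chunk containing both generated text and a usage summary (as an OpenAI-compatible llama.cpp server may send) never updated the token count. The second hunk then registers "llama.cpp" as another backend handled by async_request_openai_completions. Below is a minimal, self-contained sketch of the patched parsing behaviour, using a hypothetical parse_stream_chunks helper rather than the benchmark's own coroutine:

from typing import Iterable, Optional, Tuple


def parse_stream_chunks(chunks: Iterable[dict]) -> Tuple[str, Optional[int]]:
    """Accumulate streamed text and pick up completion_tokens when present."""
    generated_text = ""
    output_tokens = None
    for data in chunks:
        if choices := data.get("choices"):
            generated_text += choices[0].get("text") or ""
        # After the patch this check is "if", not "elif", so a chunk that
        # carries both "choices" and "usage" still updates the token count.
        if usage := data.get("usage"):
            output_tokens = usage.get("completion_tokens")
    return generated_text, output_tokens


# Example stream where the final chunk carries both text and a usage summary.
chunks = [
    {"choices": [{"text": "Hello"}]},
    {"choices": [{"text": " world"}], "usage": {"completion_tokens": 2}},
]
print(parse_stream_chunks(chunks))  # ('Hello world', 2)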

benchmarks/benchmark_serving.py

Lines changed: 4 additions & 0 deletions
@@ -762,6 +762,10 @@ def main(args: argparse.Namespace):
     if "temperature" not in sampling_params:
         sampling_params["temperature"] = 0.0  # Default to greedy decoding.
 
+    if args.backend == "llama.cpp":
+        # Disable prompt caching in llama.cpp backend
+        sampling_params["cache_prompt"] = False
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
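
This hunk adds one backend-specific default: when the benchmark targets llama.cpp, "cache_prompt" is set to False in the sampling parameters, which are forwarded with each completion request. Disabling the server's prompt cache is presumably meant to keep prefill work (and therefore latency measurements) comparable across requests, though that motivation is an inference, not something stated in the commit. A minimal sketch of the resulting parameter dict, assuming an argparse-style args object as in the script:

import argparse

# Stand-in for the parsed CLI arguments used by benchmark_serving.py.
args = argparse.Namespace(backend="llama.cpp")

sampling_params: dict = {}
if "temperature" not in sampling_params:
    sampling_params["temperature"] = 0.0  # Default to greedy decoding.

if args.backend == "llama.cpp":
    # Per this commit: disable prompt caching in the llama.cpp backend.
    sampling_params["cache_prompt"] = False

print(sampling_params)  # {'temperature': 0.0, 'cache_prompt': False}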
