2 files changed, 6 insertions(+), 1 deletion(-)

@@ -324,7 +324,7 @@ async def async_request_openai_completions(
                     most_recent_timestamp = timestamp
                     generated_text += text or ""
-                elif usage := data.get("usage"):
+                if usage := data.get("usage"):
                     output.output_tokens = usage.get("completion_tokens")

         if first_chunk_received:
             output.success = True
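The `elif` → `if` change matters when a backend's OpenAI-compatible stream attaches the `usage` object to a chunk that also carries a text delta; with `elif`, those token counts were silently dropped. Below is a minimal, self-contained sketch of the fixed parsing pattern; the chunk payloads are illustrative stand-ins, not actual server output:

```python
import json

# Illustrative stream chunks: the second carries both a text delta and usage.
chunks = [
    '{"choices": [{"text": "Hello"}]}',
    '{"choices": [{"text": " world"}], "usage": {"completion_tokens": 2}}',
]

generated_text = ""
output_tokens = None
for raw in chunks:
    data = json.loads(raw)
    if choices := data.get("choices"):
        generated_text += choices[0].get("text") or ""
    if usage := data.get("usage"):  # independent `if`, per the diff
        output_tokens = usage.get("completion_tokens")

print(generated_text, output_tokens)  # -> Hello world 2
```

With the old `elif`, the second chunk's `usage` branch would never run, leaving `output_tokens` unset.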
@@ -611,6 +611,7 @@ def get_tokenizer(
     "tensorrt-llm": async_request_trt_llm,
     "scalellm": async_request_openai_completions,
     "sglang": async_request_openai_completions,
+    "llama.cpp": async_request_openai_completions,
 }

 OPENAI_COMPATIBLE_BACKENDS = [
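The registry is a plain dict from backend name to request coroutine, so supporting llama.cpp's OpenAI-compatible completions endpoint needs only this one entry. A runnable sketch of the dispatch pattern; the function body and payload are placeholders, not the benchmark's real implementation:

```python
import asyncio

async def async_request_openai_completions(payload: dict) -> str:
    # Placeholder: the real function POSTs to the backend's /v1/completions
    # endpoint and consumes the streamed response.
    return f"would send {payload!r} via the OpenAI completions API"

# Backend name -> request coroutine, mirroring ASYNC_REQUEST_FUNCS above.
ASYNC_REQUEST_FUNCS = {
    "sglang": async_request_openai_completions,
    "llama.cpp": async_request_openai_completions,  # the new entry
}

async def main(backend: str) -> None:
    request_func = ASYNC_REQUEST_FUNCS[backend]  # KeyError on unknown backend
    print(await request_func({"prompt": "Hello", "max_tokens": 8}))

asyncio.run(main("llama.cpp"))
```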
@@ -762,6 +762,10 @@ def main(args: argparse.Namespace):
     if "temperature" not in sampling_params:
         sampling_params["temperature"] = 0.0  # Default to greedy decoding.

+    if args.backend == "llama.cpp":
+        # Disable prompt caching in llama.cpp backend
+        sampling_params["cache_prompt"] = False
+
     # Avoid GC processing "static" data - reduce pause times.
     gc.collect()
     gc.freeze()
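The llama.cpp server can reuse cached KV state for a request whose prompt shares a prefix with an earlier one (its `cache_prompt` request field), which would let repeated or shared prefixes skip prefill and skew benchmark timings. A sketch of how the flag ends up in the request body, assuming the benchmark forwards `sampling_params` as extra JSON fields; the surrounding names and values are illustrative:

```python
# Illustrative stand-ins for the benchmark's argparse/config values.
backend = "llama.cpp"
sampling_params: dict = {"max_tokens": 128}

if "temperature" not in sampling_params:
    sampling_params["temperature"] = 0.0  # default to greedy decoding

if backend == "llama.cpp":
    # Disable prompt caching so every request pays full prefill cost,
    # keeping per-request latency comparable across backends.
    sampling_params["cache_prompt"] = False

# Hypothetical request body; the real benchmark assembles this inside the
# per-backend request function.
payload = {"prompt": "Hello", "stream": True, **sampling_params}
print(payload)
```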