
Commit 304a3c6

feat: add latency tracking and enhanced token usage details (#665)
## Summary

Added latency tracking and enhanced token usage details for LLM responses, providing more granular metrics for performance monitoring and cost analysis.

## Changes

- Added latency tracking to Vertex provider responses, measuring and reporting API call duration in milliseconds
- Enhanced `BifrostLLMUsage` structure with detailed token usage fields for both prompt and completion tokens
- Added support for specialized token types like cached tokens, reasoning tokens, audio tokens, and prediction tokens
- Implemented conversion methods between different response formats to preserve token usage details
- Updated UI to display detailed token usage information when available
- Added support for vertex provider/model format in pricing lookup
- Removed commented-out code related to response pooling in Vertex provider

## Type of change

- [x] Feature
- [x] Refactor

## Affected areas

- [x] Core (Go)
- [ ] Transports (HTTP)
- [x] Providers/Integrations
- [x] Plugins
- [x] UI (Next.js)
- [ ] Docs

## How to test

Test the latency tracking and enhanced token usage details with various providers:

```sh
# Core/Transports
go version
go test ./...

# UI
cd ui
pnpm i
pnpm test
pnpm build
```

Verify that latency is reported in milliseconds in the response and that detailed token usage information is displayed in the UI when available.

## Breaking changes

- [ ] Yes
- [x] No

## Related issues

N/A

## Security considerations

No security implications as this PR only enhances metrics reporting.

## Checklist

- [x] I read `docs/contributing/README.md` and followed the guidelines
- [x] I added/updated tests where appropriate
- [x] I updated documentation where needed
- [x] I verified builds succeed (Go and UI)
- [x] I verified the CI pipeline passes locally if applicable
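A minimal sketch of how a caller might read the new metrics, assuming the response shape shown in the diffs below. The type definitions are simplified stand-ins for the real schema types in this commit, and the surrounding client code is hypothetical:

```go
package main

import "fmt"

// Simplified stand-ins mirroring fields this commit adds (see the diffs below).
type ChatCompletionTokensDetails struct {
	ReasoningTokens int `json:"reasoning_tokens,omitempty"`
}

type BifrostLLMUsage struct {
	PromptTokens            int                          `json:"prompt_tokens,omitempty"`
	CompletionTokens        int                          `json:"completion_tokens,omitempty"`
	CompletionTokensDetails *ChatCompletionTokensDetails `json:"completion_tokens_details,omitempty"`
	TotalTokens             int                          `json:"total_tokens"`
}

type ExtraFields struct {
	Latency int64 // milliseconds, as reported by the Vertex provider after this commit
}

type BifrostChatResponse struct {
	Usage       *BifrostLLMUsage
	ExtraFields ExtraFields
}

func main() {
	// Hypothetical response a caller might receive.
	resp := BifrostChatResponse{
		Usage: &BifrostLLMUsage{
			PromptTokens:            120,
			CompletionTokens:        80,
			CompletionTokensDetails: &ChatCompletionTokensDetails{ReasoningTokens: 32},
			TotalTokens:             200,
		},
		ExtraFields: ExtraFields{Latency: 842},
	}

	// Details are pointers, so check before dereferencing.
	if resp.Usage != nil && resp.Usage.CompletionTokensDetails != nil {
		fmt.Printf("latency: %dms, reasoning tokens: %d\n",
			resp.ExtraFields.Latency, resp.Usage.CompletionTokensDetails.ReasoningTokens)
	}
}
```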
2 parents eb5e9e8 + 8fd0545 commit 304a3c6

File tree

17 files changed: +273 −84 lines


core/providers/gemini.go

Lines changed: 35 additions & 25 deletions
```diff
@@ -10,6 +10,7 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"strconv"
 	"strings"
 	"time"

@@ -100,7 +101,7 @@ func (provider *GeminiProvider) ChatCompletion(ctx context.Context, key schemas.
 	jsonBody, err := sonic.Marshal(reqBody)
 	if err != nil {
 		return nil, newBifrostOperationError(schemas.ErrProviderJSONMarshaling, err, providerName)
-	}
+	}

 	// Create request
 	req := fasthttp.AcquireRequest()

@@ -126,19 +127,7 @@ func (provider *GeminiProvider) ChatCompletion(ctx context.Context, key schemas.

 	// Handle error response
 	if resp.StatusCode() != fasthttp.StatusOK {
-		var errorResp []struct {
-			Error struct {
-				Code    int    `json:"code"`
-				Message string `json:"message"`
-				Status  string `json:"status"`
-				Details []struct {
-					Type            string `json:"@type"`
-					FieldViolations []struct {
-						Description string `json:"description"`
-					} `json:"fieldViolations"`
-				} `json:"details"`
-			} `json:"error"`
-		}
+		var errorResp []gemini.GeminiGenerationError

 		bifrostErr := handleProviderAPIError(resp, &errorResp)
 		errorMessage := ""

@@ -907,29 +896,50 @@ func parseStreamGeminiError(providerName schemas.ModelProvider, resp *http.Respo
 	}

 	// Try to parse as JSON first
-	var errorResp map[string]interface{}
+	var errorResp gemini.GeminiGenerationError
 	if err := sonic.Unmarshal(body, &errorResp); err == nil {
-		// Successfully parsed as JSON
-		return newBifrostOperationError(fmt.Sprintf("Gemini streaming error: %v", errorResp), fmt.Errorf("HTTP %d", resp.StatusCode), providerName)
+		bifrostErr := &schemas.BifrostError{
+			IsBifrostError: false,
+			StatusCode:     schemas.Ptr(resp.StatusCode),
+			Error: &schemas.ErrorField{
+				Code:    schemas.Ptr(strconv.Itoa(errorResp.Error.Code)),
+				Message: errorResp.Error.Message,
+			},
+		}
+		return bifrostErr
 	}

-	// If JSON parsing fails, treat as plain text
-	bodyStr := string(body)
-	if bodyStr == "" {
-		bodyStr = "empty response body"
+	// If JSON parsing fails, use the raw response body
+	var rawResponse interface{}
+	if err := sonic.Unmarshal(body, &rawResponse); err != nil {
+		return newBifrostOperationError(schemas.ErrProviderResponseUnmarshal, err, providerName)
 	}

-	return newBifrostOperationError(fmt.Sprintf("Gemini streaming error (HTTP %d): %s", resp.StatusCode, bodyStr), fmt.Errorf("HTTP %d", resp.StatusCode), providerName)
+	return newBifrostOperationError(fmt.Sprintf("Gemini streaming error (HTTP %d): %v", resp.StatusCode, rawResponse), fmt.Errorf("HTTP %d", resp.StatusCode), providerName)
 }

 // parseGeminiError parses Gemini error responses
 func parseGeminiError(providerName schemas.ModelProvider, resp *fasthttp.Response) *schemas.BifrostError {
-	var errorResp map[string]interface{}
 	body := resp.Body()

-	if err := sonic.Unmarshal(body, &errorResp); err != nil {
+	// Try to parse as JSON first
+	var errorResp gemini.GeminiGenerationError
+	if err := sonic.Unmarshal(body, &errorResp); err == nil {
+		bifrostErr := &schemas.BifrostError{
+			IsBifrostError: false,
+			StatusCode:     schemas.Ptr(resp.StatusCode()),
+			Error: &schemas.ErrorField{
+				Code:    schemas.Ptr(strconv.Itoa(errorResp.Error.Code)),
+				Message: errorResp.Error.Message,
+			},
+		}
+		return bifrostErr
+	}
+
+	var rawResponse map[string]interface{}
+	if err := sonic.Unmarshal(body, &rawResponse); err != nil {
 		return newBifrostOperationError("failed to parse error response", err, providerName)
 	}

-	return newBifrostOperationError(fmt.Sprintf("Gemini error: %v", errorResp), fmt.Errorf("HTTP %d", resp.StatusCode()), providerName)
+	return newBifrostOperationError(fmt.Sprintf("Gemini error: %v", rawResponse), fmt.Errorf("HTTP %d", resp.StatusCode()), providerName)
 }
```
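The refactor above replaces ad-hoc `map[string]interface{}` parsing with a typed error struct plus a raw-JSON fallback. A standalone sketch of that two-stage pattern using the standard library; the `apiError` type and `parseError` helper here are hypothetical stand-ins, not Bifrost's actual `gemini.GeminiGenerationError` or error constructors:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// apiError is a hypothetical stand-in for a typed provider error shape.
type apiError struct {
	Error struct {
		Code    int    `json:"code"`
		Message string `json:"message"`
	} `json:"error"`
}

// parseError tries the typed shape first, then falls back to the raw JSON body.
func parseError(body []byte, status int) error {
	var typed apiError
	// The Message guard protects against zero-value matches on unrelated JSON objects.
	if err := json.Unmarshal(body, &typed); err == nil && typed.Error.Message != "" {
		return fmt.Errorf("HTTP %d: code=%d %s", status, typed.Error.Code, typed.Error.Message)
	}

	// Fallback: surface whatever JSON the provider actually sent.
	var raw interface{}
	if err := json.Unmarshal(body, &raw); err != nil {
		return fmt.Errorf("HTTP %d: unparseable error body: %w", status, err)
	}
	return fmt.Errorf("HTTP %d: %v", status, raw)
}

func main() {
	fmt.Println(parseError([]byte(`{"error":{"code":400,"message":"bad request"}}`), 400))
	fmt.Println(parseError([]byte(`["unexpected","shape"]`), 500))
}
```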

core/providers/vertex.go

Lines changed: 15 additions & 13 deletions
```diff
@@ -13,6 +13,7 @@ import (
 	"net/http"
 	"strings"
 	"sync"
+	"time"

 	"golang.org/x/oauth2/google"

@@ -245,6 +246,8 @@ func (provider *VertexProvider) ChatCompletion(ctx context.Context, key schemas.
 		return nil, newBifrostOperationError("error creating auth client", err, schemas.Vertex)
 	}

+	startTime := time.Now()
+
 	// Make request
 	resp, err := client.Do(req)
 	if err != nil {

@@ -267,6 +270,8 @@ func (provider *VertexProvider) ChatCompletion(ctx context.Context, key schemas.
 	}
 	defer resp.Body.Close()

+	latency := time.Since(startTime)
+
 	// Handle error response
 	// Read response body
 	body, err := io.ReadAll(resp.Body)

@@ -314,6 +319,7 @@ func (provider *VertexProvider) ChatCompletion(ctx context.Context, key schemas.
 		RequestType:    schemas.ChatCompletionRequest,
 		Provider:       schemas.Vertex,
 		ModelRequested: request.Model,
+		Latency:        latency.Milliseconds(),
 	}

 	if provider.sendBackRawResponse {

@@ -322,10 +328,7 @@ func (provider *VertexProvider) ChatCompletion(ctx context.Context, key schemas.

 		return response, nil
 	} else {
-		// Pre-allocate response structs from pools
-		// response := acquireOpenAIResponse()
 		response := &schemas.BifrostChatResponse{}
-		// defer releaseOpenAIResponse(response)

 		// Use enhanced response handler with pre-allocated response
 		rawResponse, bifrostErr := handleProviderResponse(body, response, provider.sendBackRawResponse)

@@ -336,6 +339,7 @@ func (provider *VertexProvider) ChatCompletion(ctx context.Context, key schemas.
 		response.ExtraFields.RequestType = schemas.ChatCompletionRequest
 		response.ExtraFields.Provider = schemas.Vertex
 		response.ExtraFields.ModelRequested = request.Model
+		response.ExtraFields.Latency = latency.Milliseconds()

 		if provider.sendBackRawResponse {
 			response.ExtraFields.RawResponse = rawResponse

@@ -484,22 +488,15 @@ func (provider *VertexProvider) Embedding(ctx context.Context, key schemas.Key,
 		return nil, newConfigurationError("embedding input texts are empty", schemas.Vertex)
 	}

-	// All Vertex AI embedding models use the same native Vertex embedding API
-	return provider.handleVertexEmbedding(ctx, request.Model, key, reqBody, request.Params)
-}
-
-// handleVertexEmbedding handles embedding requests using Vertex's native embedding API
-// This is used for all Vertex AI embedding models as they all use the same response format
-func (provider *VertexProvider) handleVertexEmbedding(ctx context.Context, model string, key schemas.Key, vertexReq *vertex.VertexEmbeddingRequest, params *schemas.EmbeddingParameters) (*schemas.BifrostEmbeddingResponse, *schemas.BifrostError) {
 	// Use the typed request directly
-	jsonBody, err := sonic.Marshal(vertexReq)
+	jsonBody, err := sonic.Marshal(reqBody)
 	if err != nil {
 		return nil, newBifrostOperationError(schemas.ErrProviderJSONMarshaling, err, schemas.Vertex)
 	}

 	// Build the native Vertex embedding API endpoint
 	url := fmt.Sprintf("https://%s-aiplatform.googleapis.com/v1/projects/%s/locations/%s/publishers/google/models/%s:predict",
-		key.VertexKeyConfig.Region, key.VertexKeyConfig.ProjectID, key.VertexKeyConfig.Region, model)
+		key.VertexKeyConfig.Region, key.VertexKeyConfig.ProjectID, key.VertexKeyConfig.Region, request.Model)

 	// Create request
 	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(jsonBody))

@@ -532,6 +529,8 @@ func (provider *VertexProvider) handleVertexEmbedding(ctx context.Context, model
 		return nil, newBifrostOperationError("error creating auth client", err, schemas.Vertex)
 	}

+	startTime := time.Now()
+
 	// Make request
 	resp, err := client.Do(req)
 	if err != nil {

@@ -554,6 +553,8 @@ func (provider *VertexProvider) handleVertexEmbedding(ctx context.Context, model
 	}
 	defer resp.Body.Close()

+	latency := time.Since(startTime)
+
 	// Handle error response
 	body, err := io.ReadAll(resp.Body)
 	if err != nil {

@@ -598,8 +599,9 @@ func (provider *VertexProvider) handleVertexEmbedding(ctx context.Context, model

 	// Set ExtraFields
 	bifrostResponse.ExtraFields.Provider = schemas.Vertex
-	bifrostResponse.ExtraFields.ModelRequested = model
+	bifrostResponse.ExtraFields.ModelRequested = request.Model
 	bifrostResponse.ExtraFields.RequestType = schemas.EmbeddingRequest
+	bifrostResponse.ExtraFields.Latency = latency.Milliseconds()

 	// Set raw response if enabled
 	if provider.sendBackRawResponse {
```
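The latency measurement above is plain wall-clock timing: `time.Now()` just before the HTTP call, `time.Since()` once the response arrives (before the body is read), reported via `Milliseconds()`. A minimal, self-contained sketch of the same pattern; the `timedGet` helper and URL are hypothetical, not part of this commit:

```go
package main

import (
	"fmt"
	"io"
	"net/http"
	"time"
)

// timedGet performs a request and reports wall-clock latency in milliseconds,
// mirroring the startTime / time.Since pattern added to the Vertex provider.
func timedGet(url string) (int64, error) {
	startTime := time.Now()

	resp, err := http.Get(url)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	// Captured once the response headers arrive, as in the diff above.
	latency := time.Since(startTime)

	// Drain the body so the connection can be reused.
	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
		return 0, err
	}

	// Milliseconds, matching ExtraFields.Latency.
	return latency.Milliseconds(), nil
}

func main() {
	ms, err := timedGet("https://example.com")
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	fmt.Printf("latency: %dms\n", ms)
}
```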

core/schemas/chatcompletions.go

Lines changed: 17 additions & 3 deletions
```diff
@@ -545,7 +545,21 @@ type ContentLogProb struct {

 // BifrostLLMUsage represents token usage information
 type BifrostLLMUsage struct {
-	PromptTokens     int `json:"prompt_tokens,omitempty"`
-	CompletionTokens int `json:"completion_tokens,omitempty"`
-	TotalTokens      int `json:"total_tokens"`
+	PromptTokens            int                          `json:"prompt_tokens,omitempty"`
+	PromptTokensDetails     *ChatPromptTokensDetails     `json:"prompt_tokens_details,omitempty"`
+	CompletionTokens        int                          `json:"completion_tokens,omitempty"`
+	CompletionTokensDetails *ChatCompletionTokensDetails `json:"completion_tokens_details,omitempty"`
+	TotalTokens             int                          `json:"total_tokens"`
+}
+
+type ChatPromptTokensDetails struct {
+	AudioTokens  int `json:"audio_tokens,omitempty"`
+	CachedTokens int `json:"cached_tokens,omitempty"`
+}
+
+type ChatCompletionTokensDetails struct {
+	AcceptedPredictionTokens int `json:"accepted_prediction_tokens,omitempty"`
+	AudioTokens              int `json:"audio_tokens,omitempty"`
+	ReasoningTokens          int `json:"reasoning_tokens,omitempty"`
+	RejectedPredictionTokens int `json:"rejected_prediction_tokens,omitempty"`
 }
```
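Because the new detail structs are pointers tagged `omitempty`, responses without detail data should serialize exactly as before. A quick sketch of that behavior, with the struct definitions copied (slightly trimmed) from the diff above:

```go
package main

import (
	"encoding/json"
	"fmt"
)

type ChatPromptTokensDetails struct {
	AudioTokens  int `json:"audio_tokens,omitempty"`
	CachedTokens int `json:"cached_tokens,omitempty"`
}

// Trimmed copy of BifrostLLMUsage; CompletionTokensDetails omitted for brevity.
type BifrostLLMUsage struct {
	PromptTokens        int                      `json:"prompt_tokens,omitempty"`
	PromptTokensDetails *ChatPromptTokensDetails `json:"prompt_tokens_details,omitempty"`
	CompletionTokens    int                      `json:"completion_tokens,omitempty"`
	TotalTokens         int                      `json:"total_tokens"`
}

func main() {
	// Without details: the nil pointer is omitted, so the JSON matches the old shape.
	plain, _ := json.Marshal(BifrostLLMUsage{PromptTokens: 10, CompletionTokens: 5, TotalTokens: 15})
	fmt.Println(string(plain))
	// {"prompt_tokens":10,"completion_tokens":5,"total_tokens":15}

	// With details: the nested object appears only when populated.
	detailed, _ := json.Marshal(BifrostLLMUsage{
		PromptTokens:        10,
		PromptTokensDetails: &ChatPromptTokensDetails{CachedTokens: 8},
		CompletionTokens:    5,
		TotalTokens:         15,
	})
	fmt.Println(string(detailed))
	// {"prompt_tokens":10,"prompt_tokens_details":{"cached_tokens":8},"completion_tokens":5,"total_tokens":15}
}
```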

core/schemas/mux.go

Lines changed: 61 additions & 23 deletions
```diff
@@ -615,6 +615,64 @@ func ToChatMessages(rms []ResponsesMessage) []ChatMessage {
 	return chatMessages
 }

+func (cu *BifrostLLMUsage) ToResponsesResponseUsage() *ResponsesResponseUsage {
+	if cu == nil {
+		return nil
+	}
+
+	usage := &ResponsesResponseUsage{
+		InputTokens:  cu.PromptTokens,
+		OutputTokens: cu.CompletionTokens,
+		TotalTokens:  cu.TotalTokens,
+	}
+
+	if cu.PromptTokensDetails != nil {
+		usage.InputTokensDetails = &ResponsesResponseInputTokens{
+			AudioTokens:  cu.PromptTokensDetails.AudioTokens,
+			CachedTokens: cu.PromptTokensDetails.CachedTokens,
+		}
+	}
+	if cu.CompletionTokensDetails != nil {
+		usage.OutputTokensDetails = &ResponsesResponseOutputTokens{
+			AcceptedPredictionTokens: cu.CompletionTokensDetails.AcceptedPredictionTokens,
+			AudioTokens:              cu.CompletionTokensDetails.AudioTokens,
+			ReasoningTokens:          cu.CompletionTokensDetails.ReasoningTokens,
+			RejectedPredictionTokens: cu.CompletionTokensDetails.RejectedPredictionTokens,
+		}
+	}
+
+	return usage
+}
+
+func (ru *ResponsesResponseUsage) ToBifrostLLMUsage() *BifrostLLMUsage {
+	if ru == nil {
+		return nil
+	}
+
+	usage := &BifrostLLMUsage{
+		PromptTokens:     ru.InputTokens,
+		CompletionTokens: ru.OutputTokens,
+		TotalTokens:      ru.TotalTokens,
+	}
+
+	if ru.InputTokensDetails != nil {
+		usage.PromptTokensDetails = &ChatPromptTokensDetails{
+			AudioTokens:  ru.InputTokensDetails.AudioTokens,
+			CachedTokens: ru.InputTokensDetails.CachedTokens,
+		}
+	}
+	if ru.OutputTokensDetails != nil {
+		usage.CompletionTokensDetails = &ChatCompletionTokensDetails{
+			AcceptedPredictionTokens: ru.OutputTokensDetails.AcceptedPredictionTokens,
+			AudioTokens:              ru.OutputTokensDetails.AudioTokens,
+			ReasoningTokens:          ru.OutputTokensDetails.ReasoningTokens,
+			RejectedPredictionTokens: ru.OutputTokensDetails.RejectedPredictionTokens,
+		}
+	}
+
+	return usage
+}
+
 // =============================================================================
 // REQUEST CONVERSION METHODS
 // =============================================================================

@@ -805,15 +863,7 @@ func (cr *BifrostChatResponse) ToBifrostResponsesResponse() *BifrostResponsesRes

 	// Convert Usage if needed
 	if cr.Usage != nil {
-		responsesResp.Usage = &ResponsesResponseUsage{
-			InputTokens:  cr.Usage.PromptTokens,
-			OutputTokens: cr.Usage.CompletionTokens,
-			TotalTokens:  cr.Usage.TotalTokens,
-		}
-
-		if responsesResp.Usage.TotalTokens == 0 {
-			responsesResp.Usage.TotalTokens = cr.Usage.PromptTokens + cr.Usage.CompletionTokens
-		}
+		responsesResp.Usage = cr.Usage.ToResponsesResponseUsage()
 	}

 	// Copy other relevant fields

@@ -859,15 +909,7 @@ func (responsesResp *BifrostResponsesResponse) ToBifrostChatResponse() *BifrostC
 	// Convert Usage if needed
 	if responsesResp.Usage != nil {
 		// Map Responses usage to Chat usage
-		chatResp.Usage = &BifrostLLMUsage{
-			PromptTokens:     responsesResp.Usage.InputTokens,
-			CompletionTokens: responsesResp.Usage.OutputTokens,
-			TotalTokens:      responsesResp.Usage.TotalTokens,
-		}
-
-		if chatResp.Usage.TotalTokens == 0 {
-			chatResp.Usage.TotalTokens = chatResp.Usage.PromptTokens + chatResp.Usage.CompletionTokens
-		}
+		chatResp.Usage = responsesResp.Usage.ToBifrostLLMUsage()
 	}

 	// Copy other relevant fields

@@ -976,11 +1018,7 @@ func (cr *BifrostChatResponse) ToBifrostResponsesStreamResponse() *BifrostRespon
 	// Add usage information if present in the response
 	if cr.Usage != nil {
 		streamResp.Response = &BifrostResponsesResponse{
-			Usage: &ResponsesResponseUsage{
-				InputTokens:  cr.Usage.PromptTokens,
-				OutputTokens: cr.Usage.CompletionTokens,
-				TotalTokens:  cr.Usage.TotalTokens,
-			},
+			Usage: cr.Usage.ToResponsesResponseUsage(),
 		}
 	}
 } else {
```
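The two helpers above are meant to round-trip the detail fields between the Chat and Responses usage shapes. A sketch of that round trip with simplified stand-in types; the `toResponses` / `toChat` functions here are hypothetical reductions of the methods in the diff, not the real implementations:

```go
package main

import "fmt"

// Simplified stand-ins for the schema types in this commit.
type ChatPromptTokensDetails struct{ CachedTokens int }
type BifrostLLMUsage struct {
	PromptTokens, CompletionTokens, TotalTokens int
	PromptTokensDetails                         *ChatPromptTokensDetails
}

type ResponsesResponseInputTokens struct{ CachedTokens int }
type ResponsesResponseUsage struct {
	InputTokens, OutputTokens, TotalTokens int
	InputTokensDetails                     *ResponsesResponseInputTokens
}

// Forward conversion, following the shape of ToResponsesResponseUsage above.
func toResponses(cu *BifrostLLMUsage) *ResponsesResponseUsage {
	if cu == nil {
		return nil
	}
	u := &ResponsesResponseUsage{InputTokens: cu.PromptTokens, OutputTokens: cu.CompletionTokens, TotalTokens: cu.TotalTokens}
	if cu.PromptTokensDetails != nil {
		u.InputTokensDetails = &ResponsesResponseInputTokens{CachedTokens: cu.PromptTokensDetails.CachedTokens}
	}
	return u
}

// Reverse conversion, following ToBifrostLLMUsage above.
func toChat(ru *ResponsesResponseUsage) *BifrostLLMUsage {
	if ru == nil {
		return nil
	}
	u := &BifrostLLMUsage{PromptTokens: ru.InputTokens, CompletionTokens: ru.OutputTokens, TotalTokens: ru.TotalTokens}
	if ru.InputTokensDetails != nil {
		u.PromptTokensDetails = &ChatPromptTokensDetails{CachedTokens: ru.InputTokensDetails.CachedTokens}
	}
	return u
}

func main() {
	in := &BifrostLLMUsage{PromptTokens: 100, CompletionTokens: 40, TotalTokens: 140,
		PromptTokensDetails: &ChatPromptTokensDetails{CachedTokens: 64}}
	out := toChat(toResponses(in))
	// Cached-token detail survives the round trip.
	fmt.Println(out.TotalTokens, out.PromptTokensDetails.CachedTokens) // 140 64
}
```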

core/schemas/providers/anthropic/chat.go

Lines changed: 10 additions & 1 deletion
```diff
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"time"
+
 	"github.com/maximhq/bifrost/core/schemas"
 )

@@ -350,7 +351,10 @@ func (response *AnthropicMessageResponse) ToBifrostChatResponse() *schemas.Bifro
 	// Convert usage information
 	if response.Usage != nil {
 		bifrostResponse.Usage = &schemas.BifrostLLMUsage{
-			PromptTokens: response.Usage.InputTokens,
+			PromptTokens: response.Usage.InputTokens,
+			PromptTokensDetails: &schemas.ChatPromptTokensDetails{
+				CachedTokens: response.Usage.CacheCreationInputTokens + response.Usage.CacheReadInputTokens,
+			},
 			CompletionTokens: response.Usage.OutputTokens,
 			TotalTokens:      response.Usage.InputTokens + response.Usage.OutputTokens,
 		}

@@ -613,6 +617,11 @@ func ToAnthropicChatCompletionResponse(bifrostResp *schemas.BifrostChatResponse)
 			InputTokens:  bifrostResp.Usage.PromptTokens,
 			OutputTokens: bifrostResp.Usage.CompletionTokens,
 		}
+
+		// NOTE: We cannot segregate between cache creation and cache read tokens, so we will use the total cached tokens as the cache read tokens
+		if bifrostResp.Usage.PromptTokensDetails != nil && bifrostResp.Usage.PromptTokensDetails.CachedTokens > 0 {
+			anthropicResp.Usage.CacheReadInputTokens = bifrostResp.Usage.PromptTokensDetails.CachedTokens
+		}
 	}

 	// Convert choices to content
```
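As the NOTE in the diff says, the Chat-level `CachedTokens` field merges Anthropic's separate cache-creation and cache-read counts, so the reverse mapping is lossy. A small worked example of that asymmetry; field names follow this commit, values are hypothetical:

```go
package main

import "fmt"

func main() {
	// Anthropic reports the two cache buckets separately.
	cacheCreation, cacheRead := 100, 50

	// Forward mapping (Anthropic -> Bifrost): the buckets are summed.
	cachedTokens := cacheCreation + cacheRead // 150

	// Reverse mapping (Bifrost -> Anthropic): the split is gone, so the
	// whole sum is attributed to cache reads, per the NOTE in the diff.
	reverseCacheRead, reverseCacheCreation := cachedTokens, 0

	fmt.Println(cachedTokens, reverseCacheRead, reverseCacheCreation) // 150 150 0
}
```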

core/schemas/providers/cohere/chat.go

Lines changed: 5 additions & 0 deletions
```diff
@@ -298,6 +298,11 @@ func (response *CohereChatResponse) ToBifrostChatResponse() *schemas.BifrostChat
 		if response.Usage.Tokens.OutputTokens != nil {
 			usage.CompletionTokens = int(*response.Usage.Tokens.OutputTokens)
 		}
+		if response.Usage.CachedTokens != nil {
+			usage.PromptTokensDetails = &schemas.ChatPromptTokensDetails{
+				CachedTokens: int(*response.Usage.CachedTokens),
+			}
+		}
 		usage.TotalTokens = usage.PromptTokens + usage.CompletionTokens
 	}
```
