Skip to content

Commit 83f26c7

Browse files
committed
Remove the temporary fix for gpt-4o, which was actually just messing up the cost calculation
1 parent 4a223b1 commit 83f26c7

File tree

5 files changed

+6
-18
lines changed

5 files changed

+6
-18
lines changed

evaluators/langevals/langevals_langevals/competitor_llm.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,6 @@ def evaluate(self, entry: CompetitorLLMEntry) -> SingleEvaluationResult:
153153
passed = not arguments["competitor_mentioned"] if "competitor_mentioned" in arguments else True
154154
confidence = arguments["confidence"] if "confidence" in arguments else 1
155155
reasoning = arguments["reasoning"] if "reasoning" in arguments else "No reasoning."
156-
# Temporary fix for gpt-4o
157-
if "gpt-4o" in (response.model or ""):
158-
response.model = "openai/gpt-4-turbo"
159156
cost = completion_cost(completion_response=response, prompt=prompt)
160157
details = None
161158
if not passed:

evaluators/langevals/langevals_langevals/competitor_llm_function_call.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class CompetitorLLMFunctionCallSettings(LLMEvaluatorSettings):
3434
default=["OpenAI", "Google", "Microsoft"],
3535
description="The competitors that must not be mentioned.",
3636
)
37-
37+
3838

3939

4040
class CompetitorLLMFunctionCallResult(EvaluationResult):
@@ -77,10 +77,10 @@ def evaluate(self, entry: CompetitorLLMFunctionCallEntry) -> SingleEvaluationRes
7777
for competitor in self.settings.competitors:
7878
competitors += competitor + "\n"
7979
litellm_model = model if vendor == "openai" and model != "gpt-4o" else f"{vendor}/{model}"
80-
prompt = f"""Remember that you are an advanced competitor detection system, developed by {your_company_description}.
81-
Your task is to identify mentions of competitors in any given message.
80+
prompt = f"""Remember that you are an advanced competitor detection system, developed by {your_company_description}.
81+
Your task is to identify mentions of competitors in any given message.
8282
The competitors specialize in the same field as your company and are listed below:
83-
83+
8484
Competitors:
8585
{competitors}
8686
@@ -158,9 +158,6 @@ def evaluate(self, entry: CompetitorLLMFunctionCallEntry) -> SingleEvaluationRes
158158
confidence = arguments["confidence"] if "confidence" in arguments else 1
159159
reasoning = arguments["reasoning"] if "reasoning" in arguments else "No reasoning."
160160
print(reasoning)
161-
# Temporary fix for gpt-4o
162-
if "gpt-4o" in (response.model or ""):
163-
response.model = "openai/gpt-4-turbo"
164161
cost = completion_cost(completion_response=response, prompt=prompt)
165162
details = None
166163
if not passed:

evaluators/langevals/langevals_langevals/llm_boolean.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,9 +130,6 @@ def evaluate(self, entry: CustomLLMBooleanEntry) -> SingleEvaluationResult:
130130
arguments = json.loads(
131131
cast(Message, choice.message).tool_calls[0].function.arguments
132132
)
133-
# Temporary fix for gpt-4o
134-
if "gpt-4o" in (response.model or ""):
135-
response.model = "openai/gpt-4-turbo"
136133
cost = completion_cost(completion_response=response)
137134

138135
return CustomLLMBooleanResult(

evaluators/langevals/langevals_langevals/llm_score.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,6 @@ def evaluate(self, entry: CustomLLMScoreEntry) -> SingleEvaluationResult:
128128
arguments = json.loads(
129129
cast(Message, choice.message).tool_calls[0].function.arguments
130130
)
131-
# Temporary fix for gpt-4o
132-
if "gpt-4o" in (response.model or ""):
133-
response.model = "openai/gpt-4-turbo"
134131
cost = completion_cost(completion_response=response)
135132

136133
return CustomLLMScoreResult(

evaluators/langevals/tests/test_llm_boolean.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def test_custom_llm_boolean_evaluator():
1616
contexts=["London is the capital of France."],
1717
)
1818
settings = CustomLLMBooleanSettings(
19-
model="openai/gpt-3.5-turbo-0125",
19+
model="openai/gpt-4o-mini",
2020
prompt="You are an LLM evaluator. We need the guarantee that the output is using the provided context and not it's own brain, please evaluate as False if is not.",
2121
)
2222

@@ -37,7 +37,7 @@ def test_custom_llm_boolean_evaluator_skips_if_context_is_too_large():
3737
contexts=["London is the capital of France."] * 300,
3838
)
3939
settings = CustomLLMBooleanSettings(
40-
model="openai/gpt-3.5-turbo-0125",
40+
model="openai/gpt-4o-mini",
4141
prompt="You are an LLM evaluator. We need the guarantee that the output is using the provided context and not it's own brain, please evaluate as False if is not.",
4242
max_tokens=2048,
4343
)

0 commit comments

Comments (0)