Skip to content

Commit 83f26c7

Browse files
committed
Remove the temporary fix for gpt-4o, which was actually just messing up the cost calculation
1 parent 4a223b1 commit 83f26c7

File tree

5 files changed

+6
-18
lines changed

5 files changed

+6
-18
lines changed

evaluators/langevals/langevals_langevals/competitor_llm.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,6 @@ def evaluate(self, entry: CompetitorLLMEntry) -> SingleEvaluationResult:
153153
passed = not arguments["competitor_mentioned"] if "competitor_mentioned" in arguments else True
154154
confidence = arguments["confidence"] if "confidence" in arguments else 1
155155
reasoning = arguments["reasoning"] if "reasoning" in arguments else "No reasoning."
156-
# Temporary fix for gpt-4o
157-
if "gpt-4o" in (response.model or ""):
158-
response.model = "openai/gpt-4-turbo"
159156
cost = completion_cost(completion_response=response, prompt=prompt)
160157
details = None
161158
if not passed:

evaluators/langevals/langevals_langevals/competitor_llm_function_call.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class CompetitorLLMFunctionCallSettings(LLMEvaluatorSettings):
3434
default=["OpenAI", "Google", "Microsoft"],
3535
description="The competitors that must not be mentioned.",
3636
)
37-
37+
3838

3939

4040
class CompetitorLLMFunctionCallResult(EvaluationResult):
@@ -77,10 +77,10 @@ def evaluate(self, entry: CompetitorLLMFunctionCallEntry) -> SingleEvaluationRes
7777
for competitor in self.settings.competitors:
7878
competitors += competitor + "\n"
7979
litellm_model = model if vendor == "openai" and model != "gpt-4o" else f"{vendor}/{model}"
80-
prompt = f"""Remember that you are an advanced competitor detection system, developed by {your_company_description}.
81-
Your task is to identify mentions of competitors in any given message.
80+
prompt = f"""Remember that you are an advanced competitor detection system, developed by {your_company_description}.
81+
Your task is to identify mentions of competitors in any given message.
8282
The competitors specialize in the same field as your company and are listed below:
83-
83+
8484
Competitors:
8585
{competitors}
8686
@@ -158,9 +158,6 @@ def evaluate(self, entry: CompetitorLLMFunctionCallEntry) -> SingleEvaluationRes
158158
confidence = arguments["confidence"] if "confidence" in arguments else 1
159159
reasoning = arguments["reasoning"] if "reasoning" in arguments else "No reasoning."
160160
print(reasoning)
161-
# Temporary fix for gpt-4o
162-
if "gpt-4o" in (response.model or ""):
163-
response.model = "openai/gpt-4-turbo"
164161
cost = completion_cost(completion_response=response, prompt=prompt)
165162
details = None
166163
if not passed:

evaluators/langevals/langevals_langevals/llm_boolean.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,9 +130,6 @@ def evaluate(self, entry: CustomLLMBooleanEntry) -> SingleEvaluationResult:
130130
arguments = json.loads(
131131
cast(Message, choice.message).tool_calls[0].function.arguments
132132
)
133-
# Temporary fix for gpt-4o
134-
if "gpt-4o" in (response.model or ""):
135-
response.model = "openai/gpt-4-turbo"
136133
cost = completion_cost(completion_response=response)
137134

138135
return CustomLLMBooleanResult(

evaluators/langevals/langevals_langevals/llm_score.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,6 @@ def evaluate(self, entry: CustomLLMScoreEntry) -> SingleEvaluationResult:
128128
arguments = json.loads(
129129
cast(Message, choice.message).tool_calls[0].function.arguments
130130
)
131-
# Temporary fix for gpt-4o
132-
if "gpt-4o" in (response.model or ""):
133-
response.model = "openai/gpt-4-turbo"
134131
cost = completion_cost(completion_response=response)
135132

136133
return CustomLLMScoreResult(

evaluators/langevals/tests/test_llm_boolean.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def test_custom_llm_boolean_evaluator():
1616
contexts=["London is the capital of France."],
1717
)
1818
settings = CustomLLMBooleanSettings(
19-
model="openai/gpt-3.5-turbo-0125",
19+
model="openai/gpt-4o-mini",
2020
prompt="You are an LLM evaluator. We need the guarantee that the output is using the provided context and not it's own brain, please evaluate as False if is not.",
2121
)
2222

@@ -37,7 +37,7 @@ def test_custom_llm_boolean_evaluator_skips_if_context_is_too_large():
3737
contexts=["London is the capital of France."] * 300,
3838
)
3939
settings = CustomLLMBooleanSettings(
40-
model="openai/gpt-3.5-turbo-0125",
40+
model="openai/gpt-4o-mini",
4141
prompt="You are an LLM evaluator. We need the guarantee that the output is using the provided context and not it's own brain, please evaluate as False if is not.",
4242
max_tokens=2048,
4343
)

0 commit comments

Comments (0)