
Commit 135c74b

feat: bump it all to gpt-5 as defaults
1 parent ccc8280 commit 135c74b

14 files changed: +74 additions, -70 deletions

evaluators/langevals/tests/test_llm_answer_match.py

Lines changed: 3 additions & 3 deletions
@@ -12,7 +12,7 @@ def test_llm_answer_match():
        expected_output="rock",
    )
    evaluator = LLMAnswerMatchEvaluator(
-        settings=LLMAnswerMatchSettings(model="openai/gpt-4o-mini")
+        settings=LLMAnswerMatchSettings(model="openai/gpt-5")
    )
    result = evaluator.evaluate(entry)

@@ -28,7 +28,7 @@ def test_llm_answer_match_without_question():
        expected_output="rock",
    )
    evaluator = LLMAnswerMatchEvaluator(
-        settings=LLMAnswerMatchSettings(model="openai/gpt-4o-mini")
+        settings=LLMAnswerMatchSettings(model="openai/gpt-5")
    )
    result = evaluator.evaluate(entry)

@@ -44,7 +44,7 @@ def test_llm_answer_does_not_match_match():
        expected_output="pop",
    )
    evaluator = LLMAnswerMatchEvaluator(
-        settings=LLMAnswerMatchSettings(model="openai/gpt-4o-mini")
+        settings=LLMAnswerMatchSettings(model="openai/gpt-5")
    )
    result = evaluator.evaluate(entry)

evaluators/langevals/tests/test_llm_boolean.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@ def test_custom_llm_boolean_evaluator():
        contexts=["London is the capital of France."],
    )
    settings = CustomLLMBooleanSettings(
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-5",
        prompt="You are an LLM evaluator. We need the guarantee that the output is using the provided context and not it's own brain, please evaluate as False if is not.",
    )

@@ -38,7 +38,7 @@ def test_custom_llm_boolean_evaluator_skips_if_context_is_too_large():
        contexts=["London is the capital of France."] * 300,
    )
    settings = CustomLLMBooleanSettings(
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-5",
        prompt="You are an LLM evaluator. We need the guarantee that the output is using the provided context and not it's own brain, please evaluate as False if is not.",
        max_tokens=2048,
    )

evaluators/langevals/tests/test_llm_category.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ def test_custom_llm_category_evaluator():
        contexts=["London is the capital of France."],
    )
    settings = CustomLLMCategorySettings(
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-5",
        prompt="You are an LLM category evaluator. Please categorize the answer in one of the following categories",
        categories=[
            CustomLLMCategoryDefinition(

@@ -48,7 +48,7 @@ def test_custom_llm_category_evaluator_skips_if_context_is_too_large():
        contexts=["London is the capital of France."] * 300,
    )
    settings = CustomLLMCategorySettings(
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-5",
        prompt="You are an LLM category evaluator. Please categorize the answer in one of the following categories",
        categories=[
            CustomLLMCategoryDefinition(

evaluators/langevals/tests/test_off_topic.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ def test_off_topic_evaluator():
            AllowedTopic(topic="email_delete", description="Delete an email"),
            AllowedTopic(topic="email_write", description="Write an email"),
        ],
-        model="openai/gpt-4o-mini"
+        model="openai/gpt-5"
    )
    evaluator = OffTopicEvaluator(settings=settings)
    result = evaluator.evaluate(entry)

evaluators/langevals/tests/test_query_resolution.py

Lines changed: 5 additions & 5 deletions
@@ -18,7 +18,7 @@ def test_query_resolution_conversation_evaluator_pass_for_simple_greetings():
        output="Hello, I am an assistant and I don't have feelings",
    )
    conversation = QueryResolutionEntry(conversation=[response1])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

@@ -37,7 +37,7 @@ def test_query_resolution_conversation_evaluator_pass():
        output="There is no president in the Netherlands. The system of government is constitutional monarchy.",
    )
    conversation = QueryResolutionEntry(conversation=[response1, response2])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

@@ -57,7 +57,7 @@ def test_query_resolution_conversation_evaluator_fail():
        output="There is no president in the Netherlands.",
    )
    conversation = QueryResolutionEntry(conversation=[response1, response2])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

@@ -73,7 +73,7 @@ def test_query_resolution_conversation_evaluator_fails_with_i_dont_know():
        output="Sorry, I don't have any information about the current time",
    )
    conversation = QueryResolutionEntry(conversation=[response1])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

@@ -87,7 +87,7 @@ def test_product_sentiment_polarity_evaluator_skipped_for_non_product_related_ou
    response1 = ConversationEntry(input="", output="")
    response2 = ConversationEntry(input="", output="")
    conversation = QueryResolutionEntry(conversation=[response1, response2])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

evaluators/legacy/langevals_legacy/ragas_lib/common.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@

class RagasSettings(EvaluatorSettings):
    model: str = Field(
-        default="openai/gpt-4o-mini",
+        default="openai/gpt-5",
        description="The model to use for evaluation.",
    )
    embeddings_model: str = Field(

evaluators/ragas/langevals_ragas/lib/common.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@

class RagasSettings(EvaluatorSettings):
    model: str = Field(
-        default="openai/gpt-4o-mini",
+        default="openai/gpt-5",
        description="The model to use for evaluation.",
    )
    max_tokens: int = Field(

evaluators/ragas/langevals_ragas/lib/model_to_langchain.py

Lines changed: 4 additions & 0 deletions
@@ -21,6 +21,7 @@ def create(self, *args, **kwargs):
        try:
            if self.temperature:
                kwargs["temperature"] = self.temperature
+            kwargs["drop_params"] = True
            return litellm.completion(*args, **kwargs)
        except Exception as e:
            self.exception = e
@@ -39,6 +40,9 @@ def model_to_langchain(
    if model.startswith("claude-"):
        model = model.replace("claude-", "anthropic/claude-")

+    if "gpt-5" in model:
+        temperature = 1.0
+
    return ChatOpenAI(
        model=model,
        api_key="dummy",  # type: ignore
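
Two changes stack here: the litellm wrapper now sets drop_params so unsupported arguments are silently dropped, and any gpt-5 model has its temperature forced to 1.0 before the client is built, since gpt-5 only takes the default sampling temperature. A minimal sketch of the combined behaviour, relying only on litellm's documented completion()/drop_params interface; the helper name is illustrative, not part of this repo:

import litellm

def evaluator_completion(model: str, messages: list[dict], temperature: float | None = None, **kwargs):
    # gpt-5 models only accept the default temperature, so pin it to 1.0
    if "gpt-5" in model:
        temperature = 1.0
    if temperature:
        kwargs["temperature"] = temperature
    # drop_params asks litellm to silently drop parameters the target
    # provider does not support instead of raising an error
    kwargs["drop_params"] = True
    return litellm.completion(model=model, messages=messages, **kwargs)

With drop_params set, a call that still carries an argument the provider rejects degrades gracefully instead of failing the whole evaluation.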

langevals_core/langevals_core/base_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ class EvaluatorSettings(BaseModel):

class LLMEvaluatorSettings(EvaluatorSettings):
    model: str = Field(
-        default="openai/gpt-4o-mini",
+        default="openai/gpt-5",
        description="The model to use for evaluation",
    )
    max_tokens: int = Field(
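
LLMEvaluatorSettings is the shared base settings class for the LLM-backed evaluators, so this one default moves every evaluator that does not pin a model onto gpt-5. A small sketch of what that means for callers, assuming the import path matches the file path in this diff:

# Import path inferred from langevals_core/langevals_core/base_evaluator.py;
# treat it as an assumption if the packaging differs.
from langevals_core.base_evaluator import LLMEvaluatorSettings

settings = LLMEvaluatorSettings()
print(settings.model)  # -> "openai/gpt-5" after this commit

# Callers that want the previous default must now opt in explicitly:
legacy_settings = LLMEvaluatorSettings(model="openai/gpt-4o-mini")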

tests/test_azure_evaluation.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ def test_azure_evaluation_with_custom_deployment():
            model="azure/gpt-4-1106-preview",
            prompt="Is the recipe vegetarian?",
        ),
-        env={"AZURE_DEPLOYMENT_NAME": "gpt-4o-mini"},
+        env={"AZURE_DEPLOYMENT_NAME": "gpt-5"},
    )

    expect(output="Feta Cheese and Spinach").to_pass(vegetarian_checker)
@@ -38,7 +38,7 @@ def test_azure_evaluation_with_custom_deployment():

def test_ragas_azure_evaluation_with_custom_deployment():
    answer_relevancy_checker = RagasAnswerRelevancyEvaluator(
-        settings=RagasSettings(model="azure/gpt-4o-mini"),
+        settings=RagasSettings(model="azure/gpt-5"),
        env={"AZURE_DEPLOYMENT_NAME": "gpt-4-turbo-2024-04-09"},
    )