
Commit 135c74b

feat: bump it all to gpt-5 as defaults
1 parent ccc8280 commit 135c74b

14 files changed: +74 additions, -70 deletions

evaluators/langevals/tests/test_llm_answer_match.py

Lines changed: 3 additions & 3 deletions
@@ -12,7 +12,7 @@ def test_llm_answer_match():
        expected_output="rock",
    )
    evaluator = LLMAnswerMatchEvaluator(
-        settings=LLMAnswerMatchSettings(model="openai/gpt-4o-mini")
+        settings=LLMAnswerMatchSettings(model="openai/gpt-5")
    )
    result = evaluator.evaluate(entry)

@@ -28,7 +28,7 @@ def test_llm_answer_match_without_question():
        expected_output="rock",
    )
    evaluator = LLMAnswerMatchEvaluator(
-        settings=LLMAnswerMatchSettings(model="openai/gpt-4o-mini")
+        settings=LLMAnswerMatchSettings(model="openai/gpt-5")
    )
    result = evaluator.evaluate(entry)

@@ -44,7 +44,7 @@ def test_llm_answer_does_not_match_match():
        expected_output="pop",
    )
    evaluator = LLMAnswerMatchEvaluator(
-        settings=LLMAnswerMatchSettings(model="openai/gpt-4o-mini")
+        settings=LLMAnswerMatchSettings(model="openai/gpt-5")
    )
    result = evaluator.evaluate(entry)

evaluators/langevals/tests/test_llm_boolean.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@ def test_custom_llm_boolean_evaluator():
        contexts=["London is the capital of France."],
    )
    settings = CustomLLMBooleanSettings(
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-5",
        prompt="You are an LLM evaluator. We need the guarantee that the output is using the provided context and not it's own brain, please evaluate as False if is not.",
    )

@@ -38,7 +38,7 @@ def test_custom_llm_boolean_evaluator_skips_if_context_is_too_large():
        contexts=["London is the capital of France."] * 300,
    )
    settings = CustomLLMBooleanSettings(
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-5",
        prompt="You are an LLM evaluator. We need the guarantee that the output is using the provided context and not it's own brain, please evaluate as False if is not.",
        max_tokens=2048,
    )

evaluators/langevals/tests/test_llm_category.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ def test_custom_llm_category_evaluator():
        contexts=["London is the capital of France."],
    )
    settings = CustomLLMCategorySettings(
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-5",
        prompt="You are an LLM category evaluator. Please categorize the answer in one of the following categories",
        categories=[
            CustomLLMCategoryDefinition(

@@ -48,7 +48,7 @@ def test_custom_llm_category_evaluator_skips_if_context_is_too_large():
        contexts=["London is the capital of France."] * 300,
    )
    settings = CustomLLMCategorySettings(
-        model="openai/gpt-4o-mini",
+        model="openai/gpt-5",
        prompt="You are an LLM category evaluator. Please categorize the answer in one of the following categories",
        categories=[
            CustomLLMCategoryDefinition(

evaluators/langevals/tests/test_off_topic.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ def test_off_topic_evaluator():
            AllowedTopic(topic="email_delete", description="Delete an email"),
            AllowedTopic(topic="email_write", description="Write an email"),
        ],
-        model="openai/gpt-4o-mini"
+        model="openai/gpt-5"
    )
    evaluator = OffTopicEvaluator(settings=settings)
    result = evaluator.evaluate(entry)

evaluators/langevals/tests/test_query_resolution.py

Lines changed: 5 additions & 5 deletions
@@ -18,7 +18,7 @@ def test_query_resolution_conversation_evaluator_pass_for_simple_greetings():
        output="Hello, I am an assistant and I don't have feelings",
    )
    conversation = QueryResolutionEntry(conversation=[response1])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

@@ -37,7 +37,7 @@ def test_query_resolution_conversation_evaluator_pass():
        output="There is no president in the Netherlands. The system of government is constitutional monarchy.",
    )
    conversation = QueryResolutionEntry(conversation=[response1, response2])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

@@ -57,7 +57,7 @@ def test_query_resolution_conversation_evaluator_fail():
        output="There is no president in the Netherlands.",
    )
    conversation = QueryResolutionEntry(conversation=[response1, response2])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

@@ -73,7 +73,7 @@ def test_query_resolution_conversation_evaluator_fails_with_i_dont_know():
        output="Sorry, I don't have any information about the current time",
    )
    conversation = QueryResolutionEntry(conversation=[response1])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

@@ -87,7 +87,7 @@ def test_product_sentiment_polarity_evaluator_skipped_for_non_product_related_ou
    response1 = ConversationEntry(input="", output="")
    response2 = ConversationEntry(input="", output="")
    conversation = QueryResolutionEntry(conversation=[response1, response2])
-    settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
+    settings = QueryResolutionSettings(model="openai/gpt-5", max_tokens=10000)
    evaluator = QueryResolutionEvaluator(settings=settings)
    result = evaluator.evaluate(conversation)

evaluators/legacy/langevals_legacy/ragas_lib/common.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@

class RagasSettings(EvaluatorSettings):
    model: str = Field(
-        default="openai/gpt-4o-mini",
+        default="openai/gpt-5",
        description="The model to use for evaluation.",
    )
    embeddings_model: str = Field(

evaluators/ragas/langevals_ragas/lib/common.py

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@

class RagasSettings(EvaluatorSettings):
    model: str = Field(
-        default="openai/gpt-4o-mini",
+        default="openai/gpt-5",
        description="The model to use for evaluation.",
    )
    max_tokens: int = Field(

evaluators/ragas/langevals_ragas/lib/model_to_langchain.py

Lines changed: 4 additions & 0 deletions
@@ -21,6 +21,7 @@ def create(self, *args, **kwargs):
        try:
            if self.temperature:
                kwargs["temperature"] = self.temperature
+            kwargs["drop_params"] = True
            return litellm.completion(*args, **kwargs)
        except Exception as e:
            self.exception = e
@@ -39,6 +40,9 @@ def model_to_langchain(
    if model.startswith("claude-"):
        model = model.replace("claude-", "anthropic/claude-")

+    if "gpt-5" in model:
+        temperature = 1.0
+
    return ChatOpenAI(
        model=model,
        api_key="dummy",  # type: ignore
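
Two changes stack here: the litellm wrapper now sets drop_params so unsupported arguments are silently dropped, and any gpt-5 model has its temperature forced to 1.0 before the client is built, since gpt-5 only takes the default sampling temperature. A minimal sketch of the combined behaviour, relying only on litellm's documented completion()/drop_params interface; the helper name is illustrative, not part of this repo:

import litellm

def evaluator_completion(model: str, messages: list[dict], temperature: float | None = None, **kwargs):
    # gpt-5 models only accept the default temperature, so pin it to 1.0
    if "gpt-5" in model:
        temperature = 1.0
    if temperature:
        kwargs["temperature"] = temperature
    # drop_params asks litellm to silently drop parameters the target
    # provider does not support instead of raising an error
    kwargs["drop_params"] = True
    return litellm.completion(model=model, messages=messages, **kwargs)

With drop_params set, a call that still carries an argument the provider rejects degrades gracefully instead of failing the whole evaluation.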

langevals_core/langevals_core/base_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ class EvaluatorSettings(BaseModel):

class LLMEvaluatorSettings(EvaluatorSettings):
    model: str = Field(
-        default="openai/gpt-4o-mini",
+        default="openai/gpt-5",
        description="The model to use for evaluation",
    )
    max_tokens: int = Field(
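
LLMEvaluatorSettings is the shared base settings class for the LLM-backed evaluators, so this one default moves every evaluator that does not pin a model onto gpt-5. A small sketch of what that means for callers, assuming the import path matches the file path in this diff:

# Import path inferred from langevals_core/langevals_core/base_evaluator.py;
# treat it as an assumption if the packaging differs.
from langevals_core.base_evaluator import LLMEvaluatorSettings

settings = LLMEvaluatorSettings()
print(settings.model)  # -> "openai/gpt-5" after this commit

# Callers that want the previous default must now opt in explicitly:
legacy_settings = LLMEvaluatorSettings(model="openai/gpt-4o-mini")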

tests/test_azure_evaluation.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ def test_azure_evaluation_with_custom_deployment():
            model="azure/gpt-4-1106-preview",
            prompt="Is the recipe vegetarian?",
        ),
-        env={"AZURE_DEPLOYMENT_NAME": "gpt-4o-mini"},
+        env={"AZURE_DEPLOYMENT_NAME": "gpt-5"},
    )

    expect(output="Feta Cheese and Spinach").to_pass(vegetarian_checker)
@@ -38,7 +38,7 @@ def test_azure_evaluation_with_custom_deployment():

def test_ragas_azure_evaluation_with_custom_deployment():
    answer_relevancy_checker = RagasAnswerRelevancyEvaluator(
-        settings=RagasSettings(model="azure/gpt-4o-mini"),
+        settings=RagasSettings(model="azure/gpt-5"),
        env={"AZURE_DEPLOYMENT_NAME": "gpt-4-turbo-2024-04-09"},
    )