Skip to content

Commit a7a8c9f

Browse files
committed
Allow empty input/output on the conversations
1 parent 83f26c7 commit a7a8c9f

File tree

4 files changed

+28
-70
lines changed

4 files changed

+28
-70
lines changed

evaluators/langevals/langevals_langevals/query_resolution.py

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
from langevals_core.base_evaluator import (
1313
BaseEvaluator,
1414
EvaluatorEntry,
15+
ConversationEntry,
1516
EvaluationResult,
1617
EvaluatorSettings,
1718
LLMEvaluatorSettings,
@@ -21,13 +22,8 @@
2122
)
2223

2324

24-
class QueryResolutionConversationEntry(EvaluatorEntry):
25-
input: str
26-
output: str
27-
28-
2925
class QueryResolutionEntry(EvaluatorEntry):
30-
conversation: List[QueryResolutionConversationEntry]
26+
conversation: List[ConversationEntry]
3127

3228

3329
class QueryResolutionSettings(LLMEvaluatorSettings):

evaluators/langevals/tests/test_query_resolution.py

Lines changed: 15 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,9 @@
22

33
dotenv.load_dotenv()
44

5+
from langevals_core.base_evaluator import ConversationEntry
6+
57
from langevals_langevals.query_resolution import (
6-
QueryResolutionConversationEntry,
78
QueryResolutionEntry,
89
QueryResolutionSettings,
910
QueryResolutionResult,
@@ -12,14 +13,12 @@
1213

1314

1415
def test_query_resolution_conversation_evaluator_pass_for_simple_greetings():
15-
response1 = QueryResolutionConversationEntry(
16+
response1 = ConversationEntry(
1617
input="Hey, how are you?",
1718
output="Hello, I am an assistant and I don't have feelings",
1819
)
1920
conversation = QueryResolutionEntry(conversation=[response1])
20-
settings = QueryResolutionSettings(
21-
model="openai/gpt-4o-mini", max_tokens=10000
22-
)
21+
settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
2322
evaluator = QueryResolutionEvaluator(settings=settings)
2423
result = evaluator.evaluate(conversation)
2524

@@ -30,18 +29,16 @@ def test_query_resolution_conversation_evaluator_pass_for_simple_greetings():
3029

3130

3231
def test_query_resolution_conversation_evaluator_pass():
33-
response1 = QueryResolutionConversationEntry(
32+
response1 = ConversationEntry(
3433
input="Hey, how are you?",
3534
output="Hello, I am an assistant and I don't have feelings",
3635
)
37-
response2 = QueryResolutionConversationEntry(
36+
response2 = ConversationEntry(
3837
input="Okay, is there a president in the Netherlands? Also, tell me what is the system of government in the Netherlands?",
3938
output="There is no president in the Netherlands. The system of government is constitutional monarchy.",
4039
)
4140
conversation = QueryResolutionEntry(conversation=[response1, response2])
42-
settings = QueryResolutionSettings(
43-
model="openai/gpt-4o-mini", max_tokens=10000
44-
)
41+
settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
4542
evaluator = QueryResolutionEvaluator(settings=settings)
4643
result = evaluator.evaluate(conversation)
4744

@@ -52,18 +49,16 @@ def test_query_resolution_conversation_evaluator_pass():
5249

5350

5451
def test_query_resolution_conversation_evaluator_fail():
55-
response1 = QueryResolutionConversationEntry(
52+
response1 = ConversationEntry(
5653
input="Hey, how are you?",
5754
output="Hello, I am an assistant and I don't have feelings",
5855
)
59-
response2 = QueryResolutionConversationEntry(
56+
response2 = ConversationEntry(
6057
input="Okay, is there a president in the Netherlands? Also, what equals 2 + 2? How many paws does a standard dog have?",
6158
output="There is no president in the Netherlands.",
6259
)
6360
conversation = QueryResolutionEntry(conversation=[response1, response2])
64-
settings = QueryResolutionSettings(
65-
model="openai/gpt-4o-mini", max_tokens=10000
66-
)
61+
settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
6762
evaluator = QueryResolutionEvaluator(settings=settings)
6863
result = evaluator.evaluate(conversation)
6964

@@ -74,14 +69,12 @@ def test_query_resolution_conversation_evaluator_fail():
7469

7570

7671
def test_query_resolution_conversation_evaluator_fails_with_i_dont_know():
77-
response1 = QueryResolutionConversationEntry(
72+
response1 = ConversationEntry(
7873
input="What time is it?",
7974
output="Sorry, I don't have any information about the current time",
8075
)
8176
conversation = QueryResolutionEntry(conversation=[response1])
82-
settings = QueryResolutionSettings(
83-
model="openai/gpt-4o-mini", max_tokens=10000
84-
)
77+
settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
8578
evaluator = QueryResolutionEvaluator(settings=settings)
8679
result = evaluator.evaluate(conversation)
8780

@@ -92,12 +85,10 @@ def test_query_resolution_conversation_evaluator_fails_with_i_dont_know():
9285

9386

9487
def test_product_sentiment_polarity_evaluator_skipped_for_non_product_related_outputs():
95-
response1 = QueryResolutionConversationEntry(input="", output="")
96-
response2 = QueryResolutionConversationEntry(input="", output="")
88+
response1 = ConversationEntry(input="", output="")
89+
response2 = ConversationEntry(input="", output="")
9790
conversation = QueryResolutionEntry(conversation=[response1, response2])
98-
settings = QueryResolutionSettings(
99-
model="openai/gpt-4o-mini", max_tokens=10000
100-
)
91+
settings = QueryResolutionSettings(model="openai/gpt-4o-mini", max_tokens=10000)
10192
evaluator = QueryResolutionEvaluator(settings=settings)
10293
result = evaluator.evaluate(conversation)
10394

langevals_core/langevals_core/base_evaluator.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,11 +28,14 @@
2828
"quality", "rag", "safety", "policy", "other", "custom", "similarity"
2929
]
3030

31+
3132
class EvaluatorSettings(BaseModel):
32-
pass
33+
pass
34+
3335

3436
TSettings = TypeVar("TSettings", bound=EvaluatorSettings)
3537

38+
3639
class LLMEvaluatorSettings(EvaluatorSettings):
3740
model: Literal[
3841
"openai/gpt-3.5-turbo",
@@ -60,6 +63,12 @@ class LLMEvaluatorSettings(EvaluatorSettings):
6063
description="Max tokens allowed for evaluation",
6164
)
6265

66+
67+
class ConversationEntry(BaseModel):
68+
input: str = Field(default="")
69+
output: str = Field(default="")
70+
71+
6372
class EvaluatorEntry(BaseModel):
6473
"""
6574
Entry datapoint for an evaluator, it should contain all the necessary information for the evaluator to run.
@@ -78,7 +87,7 @@ def __init_subclass__(cls, **kwargs):
7887
super().__init_subclass__(**kwargs) # Always call super()!
7988

8089
required_fields_types = {
81-
"conversation": [EvaluatorEntry, Optional[EvaluatorEntry]],
90+
"conversation": [ConversationEntry, Optional[ConversationEntry]],
8291
"input": [str, Optional[str]],
8392
"output": [str, Optional[str]],
8493
"contexts": [

0 commit comments

Comments
 (0)