Commit 1f3f4ad

Add support for atla as llm as a judge using text-only tools
1 parent 968e7ca commit 1f3f4ad

File tree

7 files changed: +258 -139 lines

evaluators/langevals/langevals_langevals/llm_answer_match.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ class LLMAnswerMatchEvaluator(
 
     def evaluate(self, entry: LLMAnswerMatchEntry) -> SingleEvaluationResult:
         total_tokens = len(
-            litellm.encode(
+            litellm.encode(  # type: ignore
                 model=self.settings.model,
                 text=f"{entry.input} {entry.output} {entry.expected_output}",
            )
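
The only change here is a `# type: ignore` on `litellm.encode`, which quiets the type checker without altering behaviour: the call still tokenizes the concatenated entry fields so the evaluator can guard against oversized inputs. A minimal sketch of that token-count guard in isolation (the helper name, model name, and threshold are illustrative, not from the commit):

import litellm

def exceeds_context(model: str, text: str, max_tokens: int = 16_384) -> bool:
    # litellm.encode picks the tokenizer matching the given model name
    # (falling back to a default tokenizer for models it does not know).
    tokens = litellm.encode(model=model, text=text)  # type: ignore
    return len(tokens) > max_tokens

# Usage, mirroring the evaluators touched by this commit:
# exceeds_context("gpt-4o-mini", f"{input} {output} {expected_output}")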

evaluators/langevals/langevals_langevals/llm_boolean.py

Lines changed: 56 additions & 43 deletions
@@ -16,6 +16,7 @@
 from litellm import Choices, Message
 from litellm.files.main import ModelResponse
 from litellm.cost_calculator import completion_cost
+import dspy
 
 
 class CustomLLMBooleanEntry(EvaluatorEntry):
@@ -33,7 +34,7 @@ class CustomLLMBooleanSettings(LLMEvaluatorSettings):
 
 
 class CustomLLMBooleanResult(EvaluationResult):
-    score: float
+    score: float = Field(default=0.0)
     passed: Optional[bool] = Field(
         description="The veredict given by the LLM", default=True
     )
@@ -74,7 +75,7 @@ def evaluate(self, entry: CustomLLMBooleanEntry) -> SingleEvaluationResult:
         content += f"# Task\n{self.settings.prompt}"
 
         total_tokens = len(
-            litellm.encode(
+            litellm.encode(  # type: ignore
                 model=self.settings.model, text=f"{self.settings.prompt} {content}"
             )
         )
@@ -86,55 +87,67 @@ def evaluate(self, entry: CustomLLMBooleanEntry) -> SingleEvaluationResult:
 
         cost = None
 
-        response = litellm.completion(
-            model=self.settings.model,
-            messages=[
-                {
-                    "role": "system",
-                    "content": self.settings.prompt
-                    + ". Always output a valid json for the function call",
-                },
-                {
-                    "role": "user",
-                    "content": content,
-                },
-            ],
-            tools=[
-                {
-                    "type": "function",
-                    "function": {
-                        "name": "evaluation",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "scratchpad": {
-                                    "type": "string",
-                                    "description": "use this field to ponder and write a short reasoning behind the decision written before a result is actually given",
-                                },
-                                "passed": {
-                                    "type": "boolean",
-                                    "description": "your final veredict, reply true or false if the content passes the test or not",
-                                },
-                            },
-                            "required": ["scratchpad", "passed"],
-                        },
-                        "description": "use this function to write your thoughts on the scratchpad, then decide if it passed or not with this json structure",
-                    },
-                },
-            ],
-            tool_choice={"type": "function", "function": {"name": "evaluation"}},  # type: ignore
-        )
-
-        response = cast(ModelResponse, response)
-        choice = cast(Choices, response.choices[0])
-        arguments = json.loads(
-            cast(Message, choice.message).tool_calls[0].function.arguments
-        )
-        cost = completion_cost(completion_response=response)
+        if "atla-selene" in self.settings.model:
+
+            class LLMJudge(dspy.Signature):
+                content: str = dspy.InputField()
+                reasoning: str = dspy.OutputField()
+                passed: bool = dspy.OutputField()
+
+            judge = dspy.Predict(LLMJudge.with_instructions(self.settings.prompt))
+            judge.set_lm(lm=dspy.LM(model=self.settings.model))
+            arguments = judge(content=content)
+
+        else:
+            response = litellm.completion(
+                model=self.settings.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": self.settings.prompt
+                        + ". Always output a valid json for the function call",
+                    },
+                    {
+                        "role": "user",
+                        "content": content,
+                    },
+                ],
+                tools=[
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "evaluation",
+                            "parameters": {
+                                "type": "object",
+                                "properties": {
+                                    "reasoning": {
+                                        "type": "string",
+                                        "description": "use this field to ponder and write a short reasoning behind the decision written before a result is actually given",
+                                    },
+                                    "passed": {
+                                        "type": "boolean",
+                                        "description": "your final veredict, reply true or false if the content passes the test or not",
+                                    },
+                                },
+                                "required": ["reasoning", "passed"],
+                            },
+                            "description": "use this function to write your thoughts on the reasoning, then decide if it passed or not with this json structure",
+                        },
+                    },
+                ],
+                tool_choice={"type": "function", "function": {"name": "evaluation"}},  # type: ignore
+            )
+
+            response = cast(ModelResponse, response)
+            choice = cast(Choices, response.choices[0])
+            arguments = json.loads(
+                cast(Message, choice.message).tool_calls[0].function.arguments  # type: ignore
+            )
+            cost = completion_cost(completion_response=response)
 
         return CustomLLMBooleanResult(
             score=1 if arguments["passed"] else 0,
             passed=arguments["passed"],
-            details=arguments["scratchpad"],
+            details=arguments["reasoning"],
             cost=Money(amount=cost, currency="USD") if cost else None,
         )
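
Atla's Selene judge models are routed around the litellm tool-calling path because they are used here as text-only judges: the verdict and reasoning come back as structured text parsed by a dspy program rather than as a function-call payload. A minimal standalone sketch of that pattern, assuming dspy is installed; the prompt and the litellm-style model string below are placeholders, not values from the commit:

import dspy

PROMPT = "Does the output answer the input politely and on-topic?"  # placeholder
MODEL = "openai/atla-selene"  # placeholder model identifier


class LLMJudge(dspy.Signature):
    # Text-only judge: outputs are parsed from the model's plain text reply,
    # so no tools/function-calling support is required from the provider.
    content: str = dspy.InputField()
    reasoning: str = dspy.OutputField()
    passed: bool = dspy.OutputField()


judge = dspy.Predict(LLMJudge.with_instructions(PROMPT))
judge.set_lm(lm=dspy.LM(model=MODEL))

result = judge(content="Input: hi\nOutput: hello, how can I help?")
# dspy predictions allow both attribute and item access, which is why the
# evaluator above can keep reading arguments["passed"] / arguments["reasoning"]
# from either branch.
print(result.passed, result.reasoning)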

evaluators/langevals/langevals_langevals/llm_category.py

Lines changed: 65 additions & 45 deletions
@@ -17,6 +17,7 @@
 from litellm.types.utils import ModelResponse
 from litellm.cost_calculator import completion_cost
 from litellm.utils import encode
+import dspy
 
 
 class CustomLLMCategoryEntry(EvaluatorEntry):
@@ -109,58 +110,77 @@ def evaluate(self, entry: CustomLLMCategoryEntry) -> SingleEvaluationResult:
 
         cost = None
 
-        response = litellm.completion(
-            model=self.settings.model,
-            messages=[
-                {
-                    "role": "system",
-                    "content": self.settings.prompt
-                    + ". Always output a valid json for the function call",
-                },
-                {
-                    "role": "user",
-                    "content": content,
-                },
-            ],
-            tools=[
-                {
-                    "type": "function",
-                    "function": {
-                        "name": "evaluation",
-                        "parameters": {
-                            "type": "object",
-                            "properties": {
-                                "scratchpad": {
-                                    "type": "string",
-                                    "description": "use this field to ponder and write a short reasoning behind the decision written before a result is actually given",
-                                },
-                                "label": {
-                                    "type": "string",
-                                    "description": "the final decision of the category for the message",
-                                    "enum": [
-                                        category.name
-                                        for category in self.settings.categories
-                                    ],
-                                },
-                            },
-                            "required": ["scratchpad", "label"],
-                        },
-                        "description": "use this function to write your thoughts on the scratchpad, then decide if it passed or not with this json structure",
-                    },
-                },
-            ],
-            tool_choice={"type": "function", "function": {"name": "evaluation"}},  # type: ignore
-        )
-
-        response = cast(ModelResponse, response)
-        choice = cast(Choices, response.choices[0])
-        arguments = json.loads(
-            cast(Message, choice.message).tool_calls[0].function.arguments  # type: ignore
-        )
-        cost = completion_cost(completion_response=response)
+        if "atla-selene" in self.settings.model:
+            # Workaround to get the Literal type for the categories at runtime
+            category_names = [
+                f'"{category.name}"' for category in self.settings.categories
+            ]
+            type_str = f"Literal[{', '.join(category_names)}]"
+            locals_dict = {"Literal": Literal}
+            type_ = eval(type_str, globals(), locals_dict)
+
+            class LLMJudge(dspy.Signature):
+                content: str = dspy.InputField()
+                reasoning: str = dspy.OutputField()
+                label: type_ = dspy.OutputField()  # type: ignore
+
+            judge = dspy.Predict(LLMJudge.with_instructions(self.settings.prompt))
+            judge.set_lm(lm=dspy.LM(model=self.settings.model))
+            arguments = judge(content=content)
+
+        else:
+            response = litellm.completion(
+                model=self.settings.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": self.settings.prompt
+                        + ". Always output a valid json for the function call",
+                    },
+                    {
+                        "role": "user",
+                        "content": content,
+                    },
+                ],
+                tools=[
+                    {
+                        "type": "function",
+                        "function": {
+                            "name": "evaluation",
+                            "parameters": {
+                                "type": "object",
+                                "properties": {
+                                    "reasoning": {
+                                        "type": "string",
+                                        "description": "use this field to ponder and write a short reasoning behind the decision written before a result is actually given",
+                                    },
+                                    "label": {
+                                        "type": "string",
+                                        "description": "the final decision of the category for the message",
+                                        "enum": [
+                                            category.name
+                                            for category in self.settings.categories
+                                        ],
+                                    },
+                                },
+                                "required": ["reasoning", "label"],
+                            },
+                            "description": "use this function to write your thoughts on the reasoning, then decide if it passed or not with this json structure",
+                        },
+                    },
+                ],
+                tool_choice={"type": "function", "function": {"name": "evaluation"}},  # type: ignore
+            )
+
+            response = cast(ModelResponse, response)
+            choice = cast(Choices, response.choices[0])
+            arguments = json.loads(
+                cast(Message, choice.message).tool_calls[0].function.arguments  # type: ignore
+            )
+            cost = completion_cost(completion_response=response)
 
         return CustomLLMCategoryResult(
             label=arguments["label"],
-            details=arguments["scratchpad"],
+            details=arguments["reasoning"],
            cost=Money(amount=cost, currency="USD") if cost else None,
        )
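
The category evaluator needs the dspy output field typed as a Literal over category names that are only known at runtime, which is what the eval()-based workaround in the diff constructs. A small sketch of the same idea in isolation; the category names are made up, and the Literal[tuple(...)] form at the end is an equivalent construction noted for comparison, not what the commit uses:

from typing import Literal

# Hypothetical category names, only known at runtime.
names = ["positive", "negative", "neutral"]

# Approach taken in the commit: build the annotation source text and eval() it.
type_str = f"Literal[{', '.join(repr(n) for n in names)}]"
label_type = eval(type_str, {"Literal": Literal})

# Equivalent construction without eval(): subscripting Literal with a tuple
# expands to Literal["positive", "negative", "neutral"].
label_type_alt = Literal[tuple(names)]  # type: ignore

assert label_type == label_type_alt
print(label_type)  # typing.Literal['positive', 'negative', 'neutral']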

Comments (0)