
Commit e4abef3

feat: allow users to customize the llm answer match prompt

1 parent: fd84082

2 files changed (+17, −4 lines)


evaluators/langevals/langevals_langevals/llm_answer_match.py

Lines changed: 7 additions & 4 deletions

@@ -22,7 +22,10 @@ class LLMAnswerMatchEntry(EvaluatorEntry):
 
 
 class LLMAnswerMatchSettings(LLMEvaluatorSettings):
-    pass
+    prompt: str = Field(
+        default="Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct.",
+        description="Prompt for the comparison",
+    )
 
 
 class LLMAnswerMatchResult(EvaluationResult):
@@ -33,8 +36,6 @@ class LLMAnswerMatchResult(EvaluationResult):
 
 
 class LLMAnswerMatchSignature(dspy.Signature):
-    """Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct."""
-
     question = dspy.InputField()
     gold_answer = dspy.InputField(desc="correct answer for question")
     predicted_answer = dspy.InputField(desc="predicted answer for question")
@@ -74,7 +75,9 @@ def evaluate(self, entry: LLMAnswerMatchEntry) -> SingleEvaluationResult:
         lm = model_to_dspy_lm(self.settings.model)
         dspy.settings.configure(experimental=True)
 
-        answer_match = dspy.Predict(LLMAnswerMatchSignature)
+        answer_match = dspy.Predict(
+            LLMAnswerMatchSignature.with_instructions(self.settings.prompt)
+        )
         answer_match.set_lm(lm)
 
         result = answer_match(
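
The key change: the comparison instructions move out of the hard-coded LLMAnswerMatchSignature docstring and into the new prompt setting, injected at call time through dspy's Signature.with_instructions(). A minimal sketch of that mechanism, independent of the langevals wiring (the output field below is an assumption for illustration; the signature's outputs are not shown in this diff):

import dspy


class AnswerMatchSignature(dspy.Signature):
    """Default instructions, replaced via with_instructions() below."""

    question = dspy.InputField()
    gold_answer = dspy.InputField(desc="correct answer for question")
    predicted_answer = dspy.InputField(desc="predicted answer for question")
    # Assumed output field, purely for illustration:
    is_match = dspy.OutputField(desc="true if the answers match")


custom_prompt = (
    "Verify that the predicted answer matches the gold answer; "
    "treat partial answers as incorrect."
)

# with_instructions() returns a copy of the signature with its instructions
# (normally the class docstring) replaced by the given string, so each run
# can be prompted with a user-supplied prompt instead of the default.
answer_match = dspy.Predict(AnswerMatchSignature.with_instructions(custom_prompt))

Calling answer_match(question=..., gold_answer=..., predicted_answer=...) is then prompted with custom_prompt rather than the class docstring, which is exactly what the evaluate() change above does with self.settings.prompt.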

ts-integration/evaluators.generated.ts

Lines changed: 10 additions & 0 deletions

@@ -215,6 +215,11 @@ export type Evaluators = {
       * @default 8192
       */
      max_tokens: number;
+     /**
+      * @description Prompt for the comparison
+      * @default "Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct."
+      */
+     prompt: string;
    };
  };
  "langevals/llm_boolean": {
@@ -1223,6 +1228,11 @@ Uses an LLM to check if the generated output answers a question correctly the sa
       description: "Max tokens allowed for evaluation",
       default: 8192,
     },
+    prompt: {
+      description: "Prompt for the comparison",
+      default:
+        "Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct.",
+    },
   },
   envVars: [],
   result: {
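
The regenerated TypeScript metadata only documents the new setting; nothing consumes it there. For reference, a settings object matching this schema would serialize roughly as the sketch below (the "prompt" field is the one added by this commit, max_tokens and its 8192 default come from the surrounding metadata, and the model default is an assumption, not shown in this diff):

# Rough shape of the llm_answer_match settings described by the generated schema.
llm_answer_match_settings = {
    "model": "openai/gpt-4o-mini",  # assumed default, not shown in this diff
    "max_tokens": 8192,
    "prompt": (
        "Verify that the predicted answer matches the gold answer for the "
        "question. Style does not matter, for example the gold answer may be "
        "more direct while the predicted answer more verbose and still be "
        "correct."
    ),
}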
