
Commit e4abef3

feat: allow users to customize the llm answer match prompt

1 parent: fd84082

2 files changed (+17, −4 lines)


evaluators/langevals/langevals_langevals/llm_answer_match.py

Lines changed: 7 additions & 4 deletions

@@ -22,7 +22,10 @@ class LLMAnswerMatchEntry(EvaluatorEntry):
 
 
 class LLMAnswerMatchSettings(LLMEvaluatorSettings):
-    pass
+    prompt: str = Field(
+        default="Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct.",
+        description="Prompt for the comparison",
+    )
 
 
 class LLMAnswerMatchResult(EvaluationResult):
@@ -33,8 +36,6 @@ class LLMAnswerMatchResult(EvaluationResult):
 
 
 class LLMAnswerMatchSignature(dspy.Signature):
-    """Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct."""
-
     question = dspy.InputField()
     gold_answer = dspy.InputField(desc="correct answer for question")
     predicted_answer = dspy.InputField(desc="predicted answer for question")
@@ -74,7 +75,9 @@ def evaluate(self, entry: LLMAnswerMatchEntry) -> SingleEvaluationResult:
         lm = model_to_dspy_lm(self.settings.model)
         dspy.settings.configure(experimental=True)
 
-        answer_match = dspy.Predict(LLMAnswerMatchSignature)
+        answer_match = dspy.Predict(
+            LLMAnswerMatchSignature.with_instructions(self.settings.prompt)
+        )
         answer_match.set_lm(lm)
 
         result = answer_match(
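
The key change: the comparison instructions move out of the hard-coded LLMAnswerMatchSignature docstring and into the new prompt setting, injected at call time through dspy's Signature.with_instructions(). A minimal sketch of that mechanism, independent of the langevals wiring (the output field below is an assumption for illustration; the signature's outputs are not shown in this diff):

import dspy


class AnswerMatchSignature(dspy.Signature):
    """Default instructions, replaced via with_instructions() below."""

    question = dspy.InputField()
    gold_answer = dspy.InputField(desc="correct answer for question")
    predicted_answer = dspy.InputField(desc="predicted answer for question")
    # Assumed output field, purely for illustration:
    is_match = dspy.OutputField(desc="true if the answers match")


custom_prompt = (
    "Verify that the predicted answer matches the gold answer; "
    "treat partial answers as incorrect."
)

# with_instructions() returns a copy of the signature with its instructions
# (normally the class docstring) replaced by the given string, so each run
# can be prompted with a user-supplied prompt instead of the default.
answer_match = dspy.Predict(AnswerMatchSignature.with_instructions(custom_prompt))

Calling answer_match(question=..., gold_answer=..., predicted_answer=...) is then prompted with custom_prompt rather than the class docstring, which is exactly what the evaluate() change above does with self.settings.prompt.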

ts-integration/evaluators.generated.ts

Lines changed: 10 additions & 0 deletions

@@ -215,6 +215,11 @@ export type Evaluators = {
       * @default 8192
       */
      max_tokens: number;
+     /**
+      * @description Prompt for the comparison
+      * @default "Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct."
+      */
+     prompt: string;
    };
  };
  "langevals/llm_boolean": {
@@ -1223,6 +1228,11 @@ Uses an LLM to check if the generated output answers a question correctly the sa
       description: "Max tokens allowed for evaluation",
       default: 8192,
     },
+    prompt: {
+      description: "Prompt for the comparison",
+      default:
+        "Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct.",
+    },
   },
   envVars: [],
   result: {
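
The regenerated TypeScript metadata only documents the new setting; nothing consumes it there. For reference, a settings object matching this schema would serialize roughly as the sketch below (the "prompt" field is the one added by this commit, max_tokens and its 8192 default come from the surrounding metadata, and the model default is an assumption, not shown in this diff):

# Rough shape of the llm_answer_match settings described by the generated schema.
llm_answer_match_settings = {
    "model": "openai/gpt-4o-mini",  # assumed default, not shown in this diff
    "max_tokens": 8192,
    "prompt": (
        "Verify that the predicted answer matches the gold answer for the "
        "question. Style does not matter, for example the gold answer may be "
        "more direct while the predicted answer more verbose and still be "
        "correct."
    ),
}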
