
Commit e98fc9d

Change ragas to use gpt-4o-mini by default, as it now performs well with it, and better document the relevant outputs for each evaluator
1 parent 1f21804 commit e98fc9d


8 files changed, +73 −40 lines changed


evaluators/langevals/langevals_langevals/basic.py

Lines changed: 12 additions & 5 deletions
@@ -27,14 +27,21 @@ class CustomBasicRule(BaseModel):
 
 
 class CustomBasicSettings(EvaluatorSettings):
-    rules: list[CustomBasicRule] = Field(default=[
-        CustomBasicRule(field="output", rule="not_contains", value="artificial intelligence"),
-    ], description="List of rules to check, the message must pass all of them")
+    rules: list[CustomBasicRule] = Field(
+        default=[
+            CustomBasicRule(
+                field="output", rule="not_contains", value="artificial intelligence"
+            ),
+        ],
+        description="List of rules to check, the message must pass all of them",
+    )
 
 
 class CustomBasicResult(EvaluationResult):
-    score: float = Field(description="Returns 1 if all rules pass, 0 if any rule fails")
-    passed: Optional[bool] = Field(default=True)
+    score: float
+    passed: Optional[bool] = Field(
+        default=True, description="True if all rules pass, False if any rule fails"
+    )
 
 
 class CustomBasicEvaluator(
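
For context, a minimal usage sketch of the reformatted settings, assuming the module imports as langevals_langevals.basic and that CustomBasicRule and CustomBasicSettings behave as the plain pydantic models shown above; the rule value here is purely illustrative:

from langevals_langevals.basic import CustomBasicRule, CustomBasicSettings

# Override the default rule with an illustrative one; field, rule and value
# follow the shapes shown in the diff above.
settings = CustomBasicSettings(
    rules=[
        CustomBasicRule(field="output", rule="not_contains", value="as an AI language model"),
    ]
)

# With no arguments, the default "not_contains artificial intelligence" rule applies.
default_settings = CustomBasicSettings()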

evaluators/langevals/langevals_langevals/llm_boolean.py

Lines changed: 3 additions & 3 deletions
@@ -33,10 +33,10 @@ class CustomLLMBooleanSettings(LLMEvaluatorSettings):
 
 
 class CustomLLMBooleanResult(EvaluationResult):
-    score: float = Field(
-        description="Returns 1 if LLM evaluates it as true, 0 if as false"
+    score: float
+    passed: Optional[bool] = Field(
+        description="The veredict given by the LLM", default=True
     )
-    passed: Optional[bool] = Field(description="The veredict given by the LLM", default=True)
 
 
 class CustomLLMBooleanEvaluator(
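
A small sketch of how the relocated documentation reads from code, assuming the module imports as langevals_langevals.llm_boolean and that the remaining EvaluationResult fields carry defaults; the verdict is now documented on passed, while score presumably still carries the 1/0 value it described before this commit:

from langevals_langevals.llm_boolean import CustomLLMBooleanResult

# Hypothetical result object: passed holds the LLM's verdict, score mirrors it as 1.0 / 0.0.
result = CustomLLMBooleanResult(score=1.0, passed=True)
if result.passed:
    print("The LLM judged the statement to be true")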

evaluators/lingua/langevals_lingua/language_detection.py

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ class LinguaLanguageDetectionRawResponse(BaseModel):
 
 
 class LinguaLanguageDetectionResult(EvaluationResult):
-    score: float = Field(description="How many languages were detected")
+    score: float
     passed: Optional[bool] = Field(
         description="Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language",
         default=None,

evaluators/openai/langevals_openai/moderation.py

Lines changed: 4 additions & 0 deletions
@@ -45,6 +45,10 @@ class OpenAIModerationSettings(EvaluatorSettings):
 
 
 class OpenAIModerationResult(EvaluationResult):
+    passed: Optional[bool] = Field(
+        description="Fails if any moderation category is flagged",
+        default=None,
+    )
     score: float = Field(
         description="The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence."
     )
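
A brief sketch of reading the now-documented result fields, assuming the module imports as langevals_openai.moderation and that the remaining EvaluationResult fields have defaults:

from langevals_openai.moderation import OpenAIModerationResult

# Hypothetical flagged result: passed fails (False) when any moderation category is
# flagged, and score is the model's confidence on the primary violated category.
result = OpenAIModerationResult(score=0.97, passed=False)
if result.passed is False:
    print(f"Flagged with confidence {result.score:.2f}")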

evaluators/ragas/langevals_ragas/answer_correctness.py

Lines changed: 14 additions & 7 deletions
@@ -1,23 +1,32 @@
+from typing import Optional
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
     EvaluationResultSkipped,
 )
 from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from pydantic import Field
 
 
 class RagasAnswerCorrectnessEntry(EvaluatorEntry):
-    input: str
+    input: Optional[str] = Field(default="")
     output: str
     expected_output: str
 
 
+class RagasAnswerCorrectnessResult(EvaluationResult):
+    score: float = Field(
+        description="A score between 0.0 and 1.0 indicating the correctness of the answer."
+    )
+
+
 class RagasAnswerCorrectnessEvaluator(
-    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult]
 ):
     """
-    This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better Correctness.
+    Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
     """
 
     name = "Ragas Answer Correctness"
@@ -30,13 +39,11 @@ class RagasAnswerCorrectnessEvaluator(
     is_guardrail = False
 
     def evaluate(self, entry: RagasAnswerCorrectnessEntry) -> SingleEvaluationResult:
-        content = entry.input or ""
-        if not content:
-            return EvaluationResultSkipped(details="Input is empty")
+        input = entry.input or ""
         return evaluate_ragas(
             evaluator=self,
             metric="answer_correctness",
-            question=entry.input,
+            question=input,
             answer=entry.output,
             ground_truth=entry.expected_output,
             settings=self.settings,
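
With input now optional, an entry can be built from only output and expected_output; empty inputs are no longer skipped but forwarded to Ragas as an empty question. A sketch under the assumption that BaseEvaluator accepts its settings via a settings keyword argument (the constructor is not shown in this diff):

from langevals_ragas.answer_correctness import (
    RagasAnswerCorrectnessEntry,
    RagasAnswerCorrectnessEvaluator,
)
from langevals_ragas.lib.common import RagasSettings

# input can now be omitted; it defaults to "" instead of causing a skip.
entry = RagasAnswerCorrectnessEntry(
    output="Paris is the capital of France.",
    expected_output="The capital of France is Paris.",
)

evaluator = RagasAnswerCorrectnessEvaluator(settings=RagasSettings())  # assumed constructor
result = evaluator.evaluate(entry)  # result.score is between 0.0 and 1.0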

evaluators/ragas/langevals_ragas/answer_relevancy.py

Lines changed: 10 additions & 2 deletions
@@ -1,21 +1,29 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
 from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from pydantic import Field
 
 
 class RagasAnswerRelevancyEntry(EvaluatorEntry):
     input: str
     output: str
 
 
+class RagasAnswerRelevancyResult(EvaluationResult):
+    score: float = Field(
+        description="A score between 0.0 and 1.0 indicating the relevance of the answer."
+    )
+
+
 class RagasAnswerRelevancyEvaluator(
-    BaseEvaluator[RagasAnswerRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasAnswerRelevancyEntry, RagasSettings, RagasAnswerRelevancyResult]
 ):
     """
-    This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
+    Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
     """
 
     name = "Ragas Answer Relevancy"

evaluators/ragas/langevals_ragas/lib/common.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ class RagasSettings(EvaluatorSettings):
         "azure/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20240620",
     ] = Field(
-        default="openai/gpt-3.5-turbo-16k",
+        default="openai/gpt-4o-mini",
         description="The model to use for evaluation.",
     )
     embeddings_model: Literal[
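
This is the change the commit message refers to: the default evaluation model for all Ragas evaluators moves from gpt-3.5-turbo-16k to gpt-4o-mini. A minimal sketch of what that means for callers, assuming RagasSettings is importable from langevals_ragas.lib.common, as the other diffs in this commit do:

from langevals_ragas.lib.common import RagasSettings

settings = RagasSettings()
print(settings.model)  # after this commit: "openai/gpt-4o-mini"

# Any model from the Literal shown above can still be selected explicitly:
settings = RagasSettings(model="anthropic/claude-3-5-sonnet-20240620")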

ts-integration/evaluators.generated.ts

Lines changed: 28 additions & 21 deletions
@@ -691,9 +691,6 @@ or if it's in a specific expected language.
       },
     },
     result: {
-      score: {
-        description: "How many languages were detected",
-      },
       passed: {
         description:
           "Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language",
@@ -925,18 +922,18 @@ social security numbers. It allows customization of the detection threshold and
   "ragas/answer_correctness": {
     name: `Ragas Answer Correctness`,
     description: `
-This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better Correctness.
+Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
 `,
     category: "rag",
     docsUrl:
       "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html",
     isGuardrail: false,
-    requiredFields: ["input", "output", "expected_output"],
-    optionalFields: [],
+    requiredFields: ["output", "expected_output"],
+    optionalFields: ["input"],
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -948,12 +945,17 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the correctness of the answer.",
+      },
+    },
   },
   "ragas/answer_relevancy": {
     name: `Ragas Answer Relevancy`,
     description: `
-This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
+Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
 `,
     category: "rag",
     docsUrl:
@@ -964,7 +966,7 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -976,7 +978,12 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
         default: 2048,
      },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the relevance of the answer.",
+      },
+    },
   },
   "ragas/context_precision": {
     name: `Ragas Context Precision`,
@@ -992,7 +999,7 @@ This metric evaluates whether all of the ground-truth relevant items present in
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1020,7 +1027,7 @@ This evaluator measures the extent to which the retrieved context aligns with th
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1048,7 +1055,7 @@ This metric gauges the relevancy of the retrieved context, calculated based on b
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1076,7 +1083,7 @@ This metric evaluates whether all of the output relevant items present in the co
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
      },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1104,7 +1111,7 @@ This evaluator assesses the extent to which the generated answer is consistent w
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1142,8 +1149,8 @@ Allows you to check for simple text matches or regex evaluation.
       },
     },
     result: {
-      score: {
-        description: "Returns 1 if all rules pass, 0 if any rule fails",
+      passed: {
+        description: "True if all rules pass, False if any rule fails",
       },
     },
   },
@@ -1279,9 +1286,6 @@ Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation
       },
     },
     result: {
-      score: {
-        description: "Returns 1 if LLM evaluates it as true, 0 if as false",
-      },
       passed: {
         description: "The veredict given by the LLM",
       },
@@ -1575,6 +1579,9 @@ including harassment, hate speech, self-harm, sexual content, and violence.
         description:
           "The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence.",
       },
+      passed: {
+        description: "Fails if any moderation category is flagged",
+      },
     },
   },
   "example/word_count": {
