
Commit be2f3e9

Add score to all ragas metrics
1 parent e5c8880 commit be2f3e9

11 files changed: +90 −26 lines changed

evaluators/ragas/langevals_ragas/answer_correctness.py

Lines changed: 6 additions & 4 deletions
@@ -4,9 +4,8 @@
     EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
-    EvaluationResultSkipped,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
 from pydantic import Field
 
 
@@ -18,12 +17,15 @@ class RagasAnswerCorrectnessEntry(EvaluatorEntry):
 
 class RagasAnswerCorrectnessResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the correctness of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the correctness of the answer.",
     )
 
 
 class RagasAnswerCorrectnessEvaluator(
-    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult]
+    BaseEvaluator[
+        RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult
+    ]
 ):
     """
     Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
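As a minimal illustration of the pattern this commit applies across the evaluators (a sketch only: ResultWithDefaultScore is a hypothetical stand-in, and plain pydantic is used because EvaluationResult from langevals_core is not shown in this diff), giving score a default of 0.0 makes the field optional at construction time:

# Minimal sketch of the pattern this commit applies to every ragas result model.
# "ResultWithDefaultScore" is a hypothetical stand-in for the EvaluationResult
# subclasses above; only plain pydantic is used so the snippet runs on its own.
from pydantic import BaseModel, Field


class ResultWithDefaultScore(BaseModel):
    score: float = Field(
        default=0.0,
        description="A score between 0.0 and 1.0 indicating the correctness of the answer.",
    )


# Without a default, `score` is a required field and constructing a result
# without it raises a pydantic ValidationError; with default=0.0 an empty
# construction succeeds.
print(ResultWithDefaultScore())            # score=0.0
print(ResultWithDefaultScore(score=0.87))  # explicit scores still work as before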

evaluators/ragas/langevals_ragas/answer_relevancy.py

Lines changed: 3 additions & 2 deletions
@@ -4,7 +4,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
 from pydantic import Field
 
 
@@ -15,7 +15,8 @@ class RagasAnswerRelevancyEntry(EvaluatorEntry):
 
 class RagasAnswerRelevancyResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the relevance of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the relevance of the answer.",
     )

evaluators/ragas/langevals_ragas/context_precision.py

Lines changed: 13 additions & 2 deletions
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextPrecisionEntry(EvaluatorEntry):
@@ -12,8 +14,17 @@ class RagasContextPrecisionEntry(EvaluatorEntry):
     expected_output: str
 
 
+class RagasContextPrecisionResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the precision of the context."
+    )
+
+
 class RagasContextPrecisionEvaluator(
-    BaseEvaluator[RagasContextPrecisionEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextPrecisionEntry, RagasSettings, RagasContextPrecisionResult
+    ]
 ):
     """
     This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.

evaluators/ragas/langevals_ragas/context_recall.py

Lines changed: 11 additions & 2 deletions
@@ -1,18 +1,27 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextRecallEntry(EvaluatorEntry):
     contexts: list[str]
     expected_output: str
 
 
+class RagasContextRecallResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the recall of the context.",
+    )
+
+
 class RagasContextRecallEvaluator(
-    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasContextRecallResult]
 ):
     """
     This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.

evaluators/ragas/langevals_ragas/context_relevancy.py

Lines changed: 13 additions & 2 deletions
@@ -1,18 +1,29 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextRelevancyEntry(EvaluatorEntry):
     output: str
     contexts: list[str]
 
 
+class RagasContextRelevancyResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the relevancy of the context.",
+    )
+
+
 class RagasContextRelevancyEvaluator(
-    BaseEvaluator[RagasContextRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextRelevancyEntry, RagasSettings, RagasContextRelevancyResult
+    ]
 ):
     """
     This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.

evaluators/ragas/langevals_ragas/context_utilization.py

Lines changed: 13 additions & 2 deletions
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextUtilizationEntry(EvaluatorEntry):
@@ -12,8 +14,17 @@ class RagasContextUtilizationEntry(EvaluatorEntry):
     contexts: list[str]
 
 
+class RagasContextUtilizationResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the utilization of the context.",
+    )
+
+
 class RagasContextUtilizationEvaluator(
-    BaseEvaluator[RagasContextUtilizationEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextUtilizationEntry, RagasSettings, RagasContextUtilizationResult
+    ]
 ):
     """
     This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization.

evaluators/ragas/langevals_ragas/faithfulness.py

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,8 @@ class RagasFaithfulnessEntry(EvaluatorEntry):
 
 class RagasFaithfulnessResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the faithfulness of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the faithfulness of the answer.",
     )
2728

poetry.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ lingua = [ "langevals-lingua",]
 aws = [ "langevals-aws",]
 huggingface = [ "langevals-huggingface",]
 langevals = [ "langevals-langevals",]
-all = [ "langevals-langevals", "langevals-azure", "langevals-example", "langevals-lingua", "langevals-ragas", "langevals-google_cloud", "langevals-haystack", "langevals-presidio", "langevals-aws", "langevals-huggingface", "langevals-openai",]
+all = [ "langevals-azure", "langevals-langevals", "langevals-example", "langevals-lingua", "langevals-ragas", "langevals-google_cloud", "langevals-haystack", "langevals-presidio", "langevals-aws", "langevals-huggingface", "langevals-openai",]
 haystack = [ "langevals-haystack",]
 presidio = [ "langevals-presidio",]

scripts/generate_evaluator_dependencies.py

Lines changed: 3 additions & 5 deletions
@@ -20,11 +20,9 @@
 package_names = []
 for package in evaluator_packages:
     package_name = f"langevals-{package}"
-    optional = "false" if package == "langevals" else "true"
-    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = {optional} }}\n'
-    if package != "langevals":
-        package_names.append(package_name)
-        generated_extras += f'{package} = ["{package_name}"]\n'
+    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = true }}\n'
+    package_names.append(package_name)
+    generated_extras += f'{package} = ["{package_name}"]\n'
 
 generated_extras += 'all = ["' + '", "'.join(package_names) + '"]'
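For reference, a small runnable sketch of what the simplified loop now emits (the two-entry evaluator_packages list is an assumed example; the real script builds it elsewhere): the langevals package is no longer special-cased, so it is generated as optional and appended to package_names like every other evaluator, which matches the regenerated all list in the pyproject.toml diff above.

# Hypothetical input; the real script builds evaluator_packages elsewhere (not shown in this diff).
evaluator_packages = ["langevals", "ragas"]

generated_dependencies = ""
generated_extras = ""
package_names = []
for package in evaluator_packages:
    package_name = f"langevals-{package}"
    # Every package, including "langevals" itself, is now optional and listed in extras.
    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = true }}\n'
    package_names.append(package_name)
    generated_extras += f'{package} = ["{package_name}"]\n'

generated_extras += 'all = ["' + '", "'.join(package_names) + '"]'

print(generated_dependencies + generated_extras)
# langevals-langevals = { path = "evaluators/langevals", develop = true, optional = true }
# langevals-ragas = { path = "evaluators/ragas", develop = true, optional = true }
# langevals = ["langevals-langevals"]
# ragas = ["langevals-ragas"]
# all = ["langevals-langevals", "langevals-ragas"]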
