
Commit be2f3e9

Add score to all ragas metrics
1 parent e5c8880 commit be2f3e9

11 files changed: +90 −26 lines changed

evaluators/ragas/langevals_ragas/answer_correctness.py

Lines changed: 6 additions & 4 deletions
@@ -4,9 +4,8 @@
     EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
-    EvaluationResultSkipped,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
 from pydantic import Field
 
 
@@ -18,12 +17,15 @@ class RagasAnswerCorrectnessEntry(EvaluatorEntry):
 
 class RagasAnswerCorrectnessResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the correctness of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the correctness of the answer.",
     )
 
 
 class RagasAnswerCorrectnessEvaluator(
-    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult]
+    BaseEvaluator[
+        RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult
+    ]
 ):
     """
     Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
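As a minimal illustration of the pattern this commit applies across the evaluators (a sketch only: ResultWithDefaultScore is a hypothetical stand-in, and plain pydantic is used because EvaluationResult from langevals_core is not shown in this diff), giving score a default of 0.0 makes the field optional at construction time:

# Minimal sketch of the pattern this commit applies to every ragas result model.
# "ResultWithDefaultScore" is a hypothetical stand-in for the EvaluationResult
# subclasses above; only plain pydantic is used so the snippet runs on its own.
from pydantic import BaseModel, Field


class ResultWithDefaultScore(BaseModel):
    score: float = Field(
        default=0.0,
        description="A score between 0.0 and 1.0 indicating the correctness of the answer.",
    )


# Without a default, `score` is a required field and constructing a result
# without it raises a pydantic ValidationError; with default=0.0 an empty
# construction succeeds.
print(ResultWithDefaultScore())            # score=0.0
print(ResultWithDefaultScore(score=0.87))  # explicit scores still work as before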

evaluators/ragas/langevals_ragas/answer_relevancy.py

Lines changed: 3 additions & 2 deletions
@@ -4,7 +4,7 @@
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
 from pydantic import Field
 
 
@@ -15,7 +15,8 @@ class RagasAnswerRelevancyEntry(EvaluatorEntry):
 
 class RagasAnswerRelevancyResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the relevance of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the relevance of the answer.",
     )

evaluators/ragas/langevals_ragas/context_precision.py

Lines changed: 13 additions & 2 deletions
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextPrecisionEntry(EvaluatorEntry):
@@ -12,8 +14,17 @@ class RagasContextPrecisionEntry(EvaluatorEntry):
     expected_output: str
 
 
+class RagasContextPrecisionResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the precision of the context."
+    )
+
+
 class RagasContextPrecisionEvaluator(
-    BaseEvaluator[RagasContextPrecisionEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextPrecisionEntry, RagasSettings, RagasContextPrecisionResult
+    ]
 ):
     """
     This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.

evaluators/ragas/langevals_ragas/context_recall.py

Lines changed: 11 additions & 2 deletions
@@ -1,18 +1,27 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextRecallEntry(EvaluatorEntry):
     contexts: list[str]
     expected_output: str
 
 
+class RagasContextRecallResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the recall of the context.",
+    )
+
+
 class RagasContextRecallEvaluator(
-    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasContextRecallResult]
 ):
     """
     This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.

evaluators/ragas/langevals_ragas/context_relevancy.py

Lines changed: 13 additions & 2 deletions
@@ -1,18 +1,29 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextRelevancyEntry(EvaluatorEntry):
     output: str
     contexts: list[str]
 
 
+class RagasContextRelevancyResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the relevancy of the context.",
+    )
+
+
 class RagasContextRelevancyEvaluator(
-    BaseEvaluator[RagasContextRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextRelevancyEntry, RagasSettings, RagasContextRelevancyResult
+    ]
 ):
     """
     This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy.

evaluators/ragas/langevals_ragas/context_utilization.py

Lines changed: 13 additions & 2 deletions
@@ -1,9 +1,11 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from pydantic import Field
 
 
 class RagasContextUtilizationEntry(EvaluatorEntry):
@@ -12,8 +14,17 @@ class RagasContextUtilizationEntry(EvaluatorEntry):
     contexts: list[str]
 
 
+class RagasContextUtilizationResult(EvaluationResult):
+    score: float = Field(
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the utilization of the context.",
+    )
+
+
 class RagasContextUtilizationEvaluator(
-    BaseEvaluator[RagasContextUtilizationEntry, RagasSettings, RagasResult]
+    BaseEvaluator[
+        RagasContextUtilizationEntry, RagasSettings, RagasContextUtilizationResult
+    ]
 ):
     """
     This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization.

evaluators/ragas/langevals_ragas/faithfulness.py

Lines changed: 2 additions & 1 deletion
@@ -21,7 +21,8 @@ class RagasFaithfulnessEntry(EvaluatorEntry):
 
 class RagasFaithfulnessResult(EvaluationResult):
     score: float = Field(
-        description="A score between 0.0 and 1.0 indicating the faithfulness of the answer."
+        default=0.0,
+        description="A score between 0.0 and 1.0 indicating the faithfulness of the answer.",
     )
2728

poetry.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ lingua = [ "langevals-lingua",]
 aws = [ "langevals-aws",]
 huggingface = [ "langevals-huggingface",]
 langevals = [ "langevals-langevals",]
-all = [ "langevals-langevals", "langevals-azure", "langevals-example", "langevals-lingua", "langevals-ragas", "langevals-google_cloud", "langevals-haystack", "langevals-presidio", "langevals-aws", "langevals-huggingface", "langevals-openai",]
+all = [ "langevals-azure", "langevals-langevals", "langevals-example", "langevals-lingua", "langevals-ragas", "langevals-google_cloud", "langevals-haystack", "langevals-presidio", "langevals-aws", "langevals-huggingface", "langevals-openai",]
 haystack = [ "langevals-haystack",]
 presidio = [ "langevals-presidio",]

scripts/generate_evaluator_dependencies.py

Lines changed: 3 additions & 5 deletions
@@ -20,11 +20,9 @@
 package_names = []
 for package in evaluator_packages:
     package_name = f"langevals-{package}"
-    optional = "false" if package == "langevals" else "true"
-    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = {optional} }}\n'
-    if package != "langevals":
-        package_names.append(package_name)
-        generated_extras += f'{package} = ["{package_name}"]\n'
+    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = true }}\n'
+    package_names.append(package_name)
+    generated_extras += f'{package} = ["{package_name}"]\n'
 
 generated_extras += 'all = ["' + '", "'.join(package_names) + '"]'
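For reference, a small runnable sketch of what the simplified loop now emits (the two-entry evaluator_packages list is an assumed example; the real script builds it elsewhere): the langevals package is no longer special-cased, so it is generated as optional and appended to package_names like every other evaluator, which matches the regenerated all list in the pyproject.toml diff above.

# Hypothetical input; the real script builds evaluator_packages elsewhere (not shown in this diff).
evaluator_packages = ["langevals", "ragas"]

generated_dependencies = ""
generated_extras = ""
package_names = []
for package in evaluator_packages:
    package_name = f"langevals-{package}"
    # Every package, including "langevals" itself, is now optional and listed in extras.
    generated_dependencies += f'{package_name} = {{ path = "evaluators/{package}", develop = true, optional = true }}\n'
    package_names.append(package_name)
    generated_extras += f'{package} = ["{package_name}"]\n'

generated_extras += 'all = ["' + '", "'.join(package_names) + '"]'

print(generated_dependencies + generated_extras)
# langevals-langevals = { path = "evaluators/langevals", develop = true, optional = true }
# langevals-ragas = { path = "evaluators/ragas", develop = true, optional = true }
# langevals = ["langevals-langevals"]
# ragas = ["langevals-ragas"]
# all = ["langevals-langevals", "langevals-ragas"]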
