
Commit 98c8524

Implement all the essential ragas RAG metrics, with a clean split between the traditional precision/recall metrics and the ones computed by LLMs over the output
1 parent 7a0f5ad commit 98c8524

18 files changed: +832 -391 lines

evaluators/ragas/langevals_ragas/answer_correctness.py

Lines changed: 0 additions & 52 deletions
This file was deleted.
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
from typing import Literal
from langevals_core.base_evaluator import (
    BaseEvaluator,
    EvaluationResult,
    EvaluatorEntry,
    SingleEvaluationResult,
)
from ragas import SingleTurnSample
from .lib.common import (
    RagasResult,
    env_vars,
    RagasSettings,
)
from pydantic import Field
from ragas.metrics import (
    NonLLMContextRecall,
    NonLLMContextPrecisionWithReference,
    NonLLMStringSimilarity,
    DistanceMeasure,
)


class RagasContextF1Entry(EvaluatorEntry):
    contexts: list[str]
    expected_contexts: list[str]


class RagasContextF1Result(EvaluationResult):
    score: float = Field(
        default=0.0,
        description="A score between 0.0 and 1.0 indicating the F1 score.",
    )


class RagasContextF1Settings(RagasSettings):
    distance_measure: Literal["levenshtein", "hamming", "jaro", "jaro_winkler"] = (
        "levenshtein"
    )


class RagasContextF1Evaluator(
    BaseEvaluator[
        RagasContextF1Entry,
        RagasContextF1Settings,
        RagasContextF1Result,
    ]
):
    """
    Balances between precision and recall for context retrieval, increasing it means a better signal-to-noise ratio. Uses traditional string distance metrics.
    """

    name = "Context F1"
    category = "rag"
    env_vars = env_vars
    default_settings = RagasContextF1Settings()
    docs_url = "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_F1/#non-llm-based-context-F1"
    is_guardrail = False

    def evaluate(self, entry: RagasContextF1Entry) -> SingleEvaluationResult:
        precision_scorer = NonLLMContextPrecisionWithReference(
            distance_measure=NonLLMStringSimilarity(
                distance_measure={
                    "levenshtein": DistanceMeasure.LEVENSHTEIN,
                    "hamming": DistanceMeasure.HAMMING,
                    "jaro": DistanceMeasure.JARO,
                    "jaro_winkler": DistanceMeasure.JARO_WINKLER,
                }[self.settings.distance_measure]
            )
        )

        precision_score = precision_scorer.single_turn_score(
            SingleTurnSample(
                retrieved_contexts=entry.contexts,
                reference_contexts=entry.expected_contexts,
            )
        )

        recall_scorer = NonLLMContextRecall()
        recall_scorer.distance_measure = {
            "levenshtein": DistanceMeasure.LEVENSHTEIN,
            "hamming": DistanceMeasure.HAMMING,
            "jaro": DistanceMeasure.JARO,
            "jaro_winkler": DistanceMeasure.JARO_WINKLER,
        }[self.settings.distance_measure]

        recall_score = recall_scorer.single_turn_score(
            SingleTurnSample(
                retrieved_contexts=entry.contexts,
                reference_contexts=entry.expected_contexts,
            )
        )

        f1_score = (
            2 * (precision_score * recall_score) / (precision_score + recall_score)
            if (precision_score + recall_score) != 0
            else 0
        )

        return RagasResult(
            score=f1_score,
            cost=None,
            details=f"Precision: {precision_score}, Recall: {recall_score}",
        )
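For reference, a minimal sketch of what this new Context F1 evaluator computes, using the same ragas scorers the file imports. The example contexts are made up, and running the scorers directly like this, outside the evaluator, is an assumption rather than part of the commit:

from ragas import SingleTurnSample
from ragas.metrics import (
    NonLLMContextPrecisionWithReference,
    NonLLMContextRecall,
    NonLLMStringSimilarity,
    DistanceMeasure,
)

# Hypothetical retrieval: two chunks retrieved, one expected.
sample = SingleTurnSample(
    retrieved_contexts=["Paris is the capital of France.", "France uses the euro."],
    reference_contexts=["Paris is the capital of France."],
)

# Precision: how much of the retrieval matches the expected contexts.
precision = NonLLMContextPrecisionWithReference(
    distance_measure=NonLLMStringSimilarity(
        distance_measure=DistanceMeasure.LEVENSHTEIN
    )
).single_turn_score(sample)

# Recall: how much of the expected contexts was actually retrieved.
recall_scorer = NonLLMContextRecall()
recall_scorer.distance_measure = DistanceMeasure.LEVENSHTEIN
recall = recall_scorer.single_turn_score(sample)

# Harmonic mean with a division-by-zero guard, as in the evaluator above.
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
print(f"precision={precision}, recall={recall}, f1={f1}")

The exact values depend on the scorers' string-similarity thresholds and on the ranking of the retrieved chunks.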
Lines changed: 49 additions & 16 deletions
@@ -1,48 +1,81 @@
+from typing import Literal
 from langevals_core.base_evaluator import (
     BaseEvaluator,
     EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from ragas import SingleTurnSample
+from .lib.common import (
+    RagasResult,
+    env_vars,
+    RagasSettings,
+)
 from pydantic import Field
+from ragas.metrics import (
+    NonLLMContextPrecisionWithReference,
+    NonLLMStringSimilarity,
+    DistanceMeasure,
+)


 class RagasContextPrecisionEntry(EvaluatorEntry):
-    input: str
     contexts: list[str]
-    expected_output: str
+    expected_contexts: list[str]


 class RagasContextPrecisionResult(EvaluationResult):
     score: float = Field(
         default=0.0,
-        description="A score between 0.0 and 1.0 indicating the precision of the context."
+        description="A score between 0.0 and 1.0 indicating the precision score.",
+    )
+
+
+class RagasContextPrecisionSettings(RagasSettings):
+    distance_measure: Literal["levenshtein", "hamming", "jaro", "jaro_winkler"] = (
+        "levenshtein"
     )


 class RagasContextPrecisionEvaluator(
     BaseEvaluator[
-        RagasContextPrecisionEntry, RagasSettings, RagasContextPrecisionResult
+        RagasContextPrecisionEntry,
+        RagasContextPrecisionSettings,
+        RagasContextPrecisionResult,
     ]
 ):
     """
-    This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision.
+    Measures how accurate is the retrieval compared to expected contexts, increasing it means less noise in the retrieval. Uses traditional string distance metrics.
     """

-    name = "Ragas Context Precision"
+    name = "Context Precision"
     category = "rag"
     env_vars = env_vars
-    default_settings = RagasSettings()
-    docs_url = "https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html"
+    default_settings = RagasContextPrecisionSettings()
+    docs_url = "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/#non-llm-based-context-precision"
     is_guardrail = False

     def evaluate(self, entry: RagasContextPrecisionEntry) -> SingleEvaluationResult:
-        return evaluate_ragas(
-            evaluator=self,
-            metric="context_precision",
-            user_input=entry.input,
-            retrieved_contexts=entry.contexts,
-            reference=entry.expected_output,
-            settings=self.settings,
+        scorer = NonLLMContextPrecisionWithReference(
+            distance_measure=NonLLMStringSimilarity(
+                distance_measure={
+                    "levenshtein": DistanceMeasure.LEVENSHTEIN,
+                    "hamming": DistanceMeasure.HAMMING,
+                    "jaro": DistanceMeasure.JARO,
+                    "jaro_winkler": DistanceMeasure.JARO_WINKLER,
+                }[self.settings.distance_measure]
+            )
+        )
+
+        score = scorer.single_turn_score(
+            SingleTurnSample(
+                retrieved_contexts=entry.contexts,
+                reference_contexts=entry.expected_contexts,
+            )
+        )
+
+        return RagasResult(
+            score=score,
+            cost=None,
+            details=None,
         )
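A possible usage sketch for the reworked precision evaluator. This is not part of the commit: the module path and the settings= constructor keyword are assumptions about how langevals evaluators are wired, and the contexts are invented; only the class names come from the diff above.

# Assumed module path and constructor style; only the class names come from the diff.
from langevals_ragas.context_precision import (
    RagasContextPrecisionEntry,
    RagasContextPrecisionEvaluator,
    RagasContextPrecisionSettings,
)

evaluator = RagasContextPrecisionEvaluator(
    settings=RagasContextPrecisionSettings(distance_measure="jaro_winkler")
)
result = evaluator.evaluate(
    RagasContextPrecisionEntry(
        contexts=["chunk about pricing", "chunk about refunds"],
        expected_contexts=["chunk about refunds"],
    )
)
print(result.score)  # closer to 1.0 when fewer irrelevant chunks were retrieved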
Lines changed: 47 additions & 17 deletions
@@ -1,47 +1,77 @@
+from typing import Literal
 from langevals_core.base_evaluator import (
     BaseEvaluator,
     EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
-from .lib.common import env_vars, evaluate_ragas, RagasSettings
+from ragas import SingleTurnSample
+from .lib.common import (
+    RagasResult,
+    env_vars,
+    RagasSettings,
+)
 from pydantic import Field
+from ragas.metrics import (
+    NonLLMContextRecall,
+    DistanceMeasure,
+)


 class RagasContextRecallEntry(EvaluatorEntry):
-    input: str
     contexts: list[str]
-    expected_output: str
+    expected_contexts: list[str]


 class RagasContextRecallResult(EvaluationResult):
     score: float = Field(
         default=0.0,
-        description="A score between 0.0 and 1.0 indicating the recall of the context.",
+        description="A score between 0.0 and 1.0 indicating the Recall score.",
+    )
+
+
+class RagasContextRecallSettings(RagasSettings):
+    distance_measure: Literal["levenshtein", "hamming", "jaro", "jaro_winkler"] = (
+        "levenshtein"
     )


 class RagasContextRecallEvaluator(
-    BaseEvaluator[RagasContextRecallEntry, RagasSettings, RagasContextRecallResult]
+    BaseEvaluator[
+        RagasContextRecallEntry,
+        RagasContextRecallSettings,
+        RagasContextRecallResult,
+    ]
 ):
     """
-    This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance.
+    Measures how many relevant contexts were retrieved compared to expected contexts, increasing it means more signal in the retrieval. Uses traditional string distance metrics.
     """

-    name = "Ragas Context Recall"
+    name = "Context Recall"
     category = "rag"
     env_vars = env_vars
-    default_settings = RagasSettings()
-    docs_url = "https://docs.ragas.io/en/latest/concepts/metrics/context_recall.html"
+    default_settings = RagasContextRecallSettings()
+    docs_url = "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_recall/#non-llm-based-context-recall"
     is_guardrail = False

     def evaluate(self, entry: RagasContextRecallEntry) -> SingleEvaluationResult:
-        input = entry.input or ""
-        return evaluate_ragas(
-            evaluator=self,
-            metric="context_recall",
-            user_input=input,
-            retrieved_contexts=entry.contexts,
-            reference=entry.expected_output,
-            settings=self.settings,
+        scorer = NonLLMContextRecall()
+        scorer.distance_measure = {
+            "levenshtein": DistanceMeasure.LEVENSHTEIN,
+            "hamming": DistanceMeasure.HAMMING,
+            "jaro": DistanceMeasure.JARO,
+            "jaro_winkler": DistanceMeasure.JARO_WINKLER,
+        }[self.settings.distance_measure]
+
+        score = scorer.single_turn_score(
+            SingleTurnSample(
+                retrieved_contexts=entry.contexts,
+                reference_contexts=entry.expected_contexts,
+            )
+        )
+
+        return RagasResult(
+            score=score,
+            cost=None,
+            details=None,
        )
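And a rough illustration of how the non-LLM recall behaves, mirroring the scorer setup in the diff above. The contexts are made up and the exact score depends on the scorer's internal similarity threshold:

from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextRecall, DistanceMeasure

scorer = NonLLMContextRecall()
scorer.distance_measure = DistanceMeasure.LEVENSHTEIN

# Only one of the two expected contexts shows up in the retrieval,
# so recall should land around 0.5 while precision would stay high.
score = scorer.single_turn_score(
    SingleTurnSample(
        retrieved_contexts=["The Eiffel Tower is in Paris."],
        reference_contexts=[
            "The Eiffel Tower is in Paris.",
            "It was completed in 1889.",
        ],
    )
)
print(score)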

evaluators/ragas/langevals_ragas/context_relevancy.py

Lines changed: 0 additions & 46 deletions
This file was deleted.
