Skip to content

Commit 9562047

Browse files
committed
Accept empty contexts and expected_contexts in the context F1/precision/recall evaluators, returning a fixed score early so that ragas is never called with empty inputs (which makes it fail)
1 parent f4c187e commit 9562047

File tree

4 files changed

+172
-18
lines changed

4 files changed

+172
-18
lines changed

evaluators/ragas/langevals_ragas/context_f1.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -69,27 +69,39 @@ def evaluate(self, entry: RagasContextF1Entry) -> SingleEvaluationResult:
6969
)
7070
)
7171

72-
precision_score = precision_scorer.single_turn_score(
73-
SingleTurnSample(
74-
retrieved_contexts=entry.contexts,
75-
reference_contexts=entry.expected_contexts,
72+
if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
73+
precision_score = 1.0
74+
elif len(entry.expected_contexts) == 0 or len(entry.contexts) == 0:
75+
precision_score = 0.0
76+
else:
77+
precision_score = precision_scorer.single_turn_score(
78+
SingleTurnSample(
79+
retrieved_contexts=entry.contexts,
80+
reference_contexts=entry.expected_contexts,
81+
)
7682
)
77-
)
7883

79-
recall_scorer = NonLLMContextRecall()
80-
recall_scorer.distance_measure = {
81-
"levenshtein": DistanceMeasure.LEVENSHTEIN,
82-
"hamming": DistanceMeasure.HAMMING,
83-
"jaro": DistanceMeasure.JARO,
84-
"jaro_winkler": DistanceMeasure.JARO_WINKLER,
85-
}[self.settings.distance_measure]
84+
if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
85+
recall_score = 1.0
86+
elif len(entry.expected_contexts) == 0:
87+
recall_score = 1.0
88+
elif len(entry.contexts) == 0:
89+
recall_score = 0.0
90+
else:
91+
recall_scorer = NonLLMContextRecall()
92+
recall_scorer.distance_measure = {
93+
"levenshtein": DistanceMeasure.LEVENSHTEIN,
94+
"hamming": DistanceMeasure.HAMMING,
95+
"jaro": DistanceMeasure.JARO,
96+
"jaro_winkler": DistanceMeasure.JARO_WINKLER,
97+
}[self.settings.distance_measure]
8698

87-
recall_score = recall_scorer.single_turn_score(
88-
SingleTurnSample(
89-
retrieved_contexts=entry.contexts,
90-
reference_contexts=entry.expected_contexts,
99+
recall_score = recall_scorer.single_turn_score(
100+
SingleTurnSample(
101+
retrieved_contexts=entry.contexts,
102+
reference_contexts=entry.expected_contexts,
103+
)
91104
)
92-
)
93105

94106
f1_score = (
95107
2 * (precision_score * recall_score) / (precision_score + recall_score)

evaluators/ragas/langevals_ragas/context_precision.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,24 @@ class RagasContextPrecisionEvaluator(
5656
is_guardrail = False
5757

5858
def evaluate(self, entry: RagasContextPrecisionEntry) -> SingleEvaluationResult:
59+
if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
60+
return RagasResult(
61+
score=1.0,
62+
cost=None,
63+
details="No contexts retrieved, but also no contexts expected, so that's a perfect precision of 1",
64+
)
65+
if len(entry.expected_contexts) == 0:
66+
return RagasResult(
67+
score=0.0,
68+
cost=None,
69+
details="No contexts expected, yet some were retrieved, precision is 0",
70+
)
71+
if len(entry.contexts) == 0:
72+
return RagasResult(
73+
score=0.0,
74+
cost=None,
75+
details="No contexts retrieved, precision is 0",
76+
)
5977
scorer = NonLLMContextPrecisionWithReference(
6078
distance_measure=NonLLMStringSimilarity(
6179
distance_measure={

evaluators/ragas/langevals_ragas/context_recall.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
EvaluatorEntry,
66
SingleEvaluationResult,
77
EvaluatorSettings,
8+
EvaluationResultSkipped,
89
)
910
from ragas import SingleTurnSample
1011
from .lib.common import (
@@ -55,6 +56,25 @@ class RagasContextRecallEvaluator(
5556
is_guardrail = False
5657

5758
def evaluate(self, entry: RagasContextRecallEntry) -> SingleEvaluationResult:
59+
if len(entry.expected_contexts) == 0 and len(entry.contexts) == 0:
60+
return RagasResult(
61+
score=1.0,
62+
cost=None,
63+
details="No contexts retrieved, but also no contexts expected, so that's a perfect recall of 1",
64+
)
65+
if len(entry.expected_contexts) == 0:
66+
return RagasResult(
67+
score=1.0,
68+
cost=None,
69+
details="No contexts expected, meaning nothing was missing, so that's a perfect recall of 1",
70+
)
71+
if len(entry.contexts) == 0:
72+
return RagasResult(
73+
score=0.0,
74+
cost=None,
75+
details="No contexts retrieved, recall is 0",
76+
)
77+
5878
scorer = NonLLMContextRecall()
5979
scorer.distance_measure = {
6080
"levenshtein": DistanceMeasure.LEVENSHTEIN,

evaluators/ragas/tests/test_ragas.py

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,44 @@ def test_context_precision():
177177
assert not result.cost
178178

179179

180+
def test_context_precision_with_empty_contexts():
181+
evaluator = RagasContextPrecisionEvaluator(settings=RagasContextPrecisionSettings())
182+
183+
result = evaluator.evaluate(
184+
RagasContextPrecisionEntry(
185+
contexts=[],
186+
expected_contexts=[],
187+
)
188+
)
189+
assert result.status == "processed"
190+
assert result.score is not None and result.score == 1.0
191+
assert not result.cost
192+
193+
result = evaluator.evaluate(
194+
RagasContextPrecisionEntry(
195+
contexts=[],
196+
expected_contexts=[
197+
"Paris is the capital of France.",
198+
"The Eiffel Tower is one of the most famous landmarks in Paris.",
199+
],
200+
)
201+
)
202+
assert result.status == "processed"
203+
assert result.score is not None and result.score == 0.0
204+
assert not result.cost
205+
206+
result = evaluator.evaluate(
207+
RagasContextPrecisionEntry(
208+
contexts=["The Eiffel Tower is located in Paris."],
209+
expected_contexts=[],
210+
)
211+
)
212+
213+
assert result.status == "processed"
214+
assert result.score is not None and result.score == 0.0
215+
assert not result.cost
216+
217+
180218
def test_context_recall():
181219
evaluator = RagasContextRecallEvaluator(settings=RagasContextRecallSettings())
182220

@@ -195,6 +233,46 @@ def test_context_recall():
195233
assert not result.cost
196234

197235

236+
def test_context_recall_with_empty_contexts():
237+
evaluator = RagasContextRecallEvaluator(settings=RagasContextRecallSettings())
238+
239+
result = evaluator.evaluate(
240+
RagasContextRecallEntry(
241+
contexts=[],
242+
expected_contexts=[],
243+
)
244+
)
245+
246+
assert result.status == "processed"
247+
assert result.score is not None and result.score == 1.0
248+
assert not result.cost
249+
250+
result = evaluator.evaluate(
251+
RagasContextRecallEntry(
252+
contexts=[],
253+
expected_contexts=[
254+
"Paris is the capital of France.",
255+
"The Eiffel Tower is one of the most famous landmarks in Paris.",
256+
],
257+
)
258+
)
259+
260+
assert result.status == "processed"
261+
assert result.score is not None and result.score == 0.0
262+
assert not result.cost
263+
264+
result = evaluator.evaluate(
265+
RagasContextRecallEntry(
266+
contexts=["The Eiffel Tower is located in Paris."],
267+
expected_contexts=[],
268+
)
269+
)
270+
271+
assert result.status == "processed"
272+
assert result.score is not None and result.score == 1.0
273+
assert not result.cost
274+
275+
198276
def test_context_f1():
199277
evaluator = RagasContextF1Evaluator(settings=RagasContextF1Settings())
200278

@@ -214,6 +292,32 @@ def test_context_f1():
214292
assert result.details
215293

216294

295+
def test_context_f1_with_empty_contexts():
296+
evaluator = RagasContextF1Evaluator(settings=RagasContextF1Settings())
297+
298+
result = evaluator.evaluate(RagasContextF1Entry(contexts=[], expected_contexts=[]))
299+
300+
assert result.status == "processed"
301+
assert result.score is not None and result.score == 1.0
302+
assert not result.cost
303+
304+
result = evaluator.evaluate(
305+
RagasContextF1Entry(contexts=[], expected_contexts=["context"])
306+
)
307+
308+
assert result.status == "processed"
309+
assert result.score is not None and result.score == 0.0
310+
assert not result.cost
311+
312+
result = evaluator.evaluate(
313+
RagasContextF1Entry(contexts=["context"], expected_contexts=[])
314+
)
315+
316+
assert result.status == "processed"
317+
assert result.score is not None and result.score == 0.0
318+
assert not result.cost
319+
320+
217321
def test_response_context_precision_with_reference():
218322
evaluator = RagasResponseContextPrecisionEvaluator(settings=RagasSettings())
219323

@@ -319,7 +423,7 @@ def test_summarization_score():
319423
)
320424

321425
assert result.status == "processed"
322-
assert result.score and result.score > 0.7
426+
assert result.score and result.score > 0.6
323427
assert result.cost and result.cost.amount > 0.0
324428
assert result.details
325429

0 commit comments

Comments
 (0)