
Commit 1e3effd

Fix ragas relevancy description for better explainability
1 parent e698bdb commit 1e3effd

3 files changed: +34 -11 lines changed

evaluators/ragas/langevals_ragas/lib/common.py

Lines changed: 6 additions & 2 deletions
@@ -50,13 +50,17 @@ class _GenericEvaluatorEntry(EvaluatorEntry):
     expected_output: Optional[str]


-def prepare_llm(evaluator: BaseEvaluator, settings: RagasSettings = RagasSettings()):
+def prepare_llm(
+    evaluator: BaseEvaluator,
+    settings: RagasSettings = RagasSettings(),
+    temperature: float = 0,
+):
     os.environ["AZURE_API_VERSION"] = "2023-07-01-preview"
     if evaluator.env:
         for key, env in evaluator.env.items():
             os.environ[key] = env

-    gpt = model_to_langchain(settings.model)
+    gpt = model_to_langchain(settings.model, temperature=temperature)
     llm = LangchainLLMWrapper(langchain_llm=gpt)

     if hasattr(settings, "embeddings_model"):
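
For orientation: prepare_llm now accepts an optional temperature and forwards it to model_to_langchain; with the default of 0 the previous behaviour is preserved. A minimal sketch of the intended call site, assuming an evaluator instance with RagasSettings (the 0.7 value mirrors the response_relevancy.py change below):

    # Hypothetical caller inside an evaluator's evaluate() method.
    llm, embeddings = prepare_llm(self, self.settings, temperature=0.7)
    # temperature=0 (the default) keeps the deterministic behaviour from before this commit.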

evaluators/ragas/langevals_ragas/lib/model_to_langchain.py

Lines changed: 10 additions & 3 deletions
@@ -12,9 +12,15 @@

 class LitellmCompletion:
     exception: Optional[Exception] = None
+    temperature: float = 0
+
+    def __init__(self, temperature: float = 0):
+        self.temperature = temperature

     def create(self, *args, **kwargs):
         try:
+            if self.temperature:
+                kwargs["temperature"] = self.temperature
             return litellm.completion(*args, **kwargs)
         except Exception as e:
             self.exception = e
@@ -28,16 +34,17 @@ async def create(self, *args, **kwargs):

 def model_to_langchain(
     model: str,
+    temperature: float = 0,
 ) -> BaseChatModel:
     if model.startswith("claude-"):
         model = model.replace("claude-", "anthropic/claude-")

     return ChatOpenAI(
         model=model,
         api_key="dummy", # type: ignore
-        client=LitellmCompletion(),
-        async_client=AsyncLitellmCompletion(),
-        temperature=0,
+        temperature=temperature or 0,
+        client=LitellmCompletion(temperature=temperature),
+        async_client=AsyncLitellmCompletion(temperature=temperature),
     )

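
The LitellmCompletion client only overrides the temperature when a non-zero value is supplied, so calls are forwarded to litellm.completion untouched otherwise. A minimal usage sketch (the model name is a placeholder, not part of the commit):

    # Hypothetical usage: build a LangChain chat model routed through litellm.
    chat = model_to_langchain("gpt-4o-mini", temperature=0.7)
    # LitellmCompletion.create sets kwargs["temperature"] = 0.7 before delegating
    # to litellm.completion; the async client is constructed the same way.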

evaluators/ragas/langevals_ragas/response_relevancy.py

Lines changed: 18 additions & 6 deletions
@@ -1,3 +1,4 @@
+from typing import Sequence
 from langevals_core.base_evaluator import (
     BaseEvaluator,
     EvaluationResult,
@@ -16,6 +17,7 @@
 )
 from pydantic import Field
 from ragas.metrics import ResponseRelevancy
+from ragas.metrics._answer_relevance import ResponseRelevanceOutput


 class RagasResponseRelevancyEntry(EvaluatorEntry):
@@ -56,7 +58,7 @@ class RagasResponseRelevancyEvaluator(
     is_guardrail = False

     def evaluate(self, entry: RagasResponseRelevancyEntry) -> SingleEvaluationResult:
-        llm, embeddings = prepare_llm(self, self.settings)
+        llm, embeddings = prepare_llm(self, self.settings, temperature=0.7)

         skip = check_max_tokens(
             input=entry.input,
@@ -69,17 +71,23 @@ def evaluate(self, entry: RagasResponseRelevancyEntry) -> SingleEvaluationResult
         scorer = ResponseRelevancy(llm=llm, embeddings=embeddings)

         _original_calculate_similarity = scorer.calculate_similarity
+        _original_calculate_score = scorer._calculate_score

-        breakdown = {"similarity": 0, "generated_questions": []}
+        breakdown = {"similarity": 0, "answers": []}

         def calculate_similarity(question: str, generated_questions):
             nonlocal breakdown
-            breakdown["generated_questions"] += generated_questions
             similarity = _original_calculate_similarity(question, generated_questions)
             breakdown["similarity"] += similarity
             return similarity

+        def _calculate_score(answers: Sequence[ResponseRelevanceOutput], row: dict):
+            nonlocal breakdown
+            breakdown["answers"] += answers
+            return _original_calculate_score(answers, row)
+
         scorer.calculate_similarity = calculate_similarity
+        scorer._calculate_score = _calculate_score

         with capture_cost(llm) as cost:
             score = scorer.single_turn_score(
@@ -89,15 +97,19 @@ def calculate_similarity(question: str, generated_questions):
                 )
             )

-        generated_questions = "\n- ".join(breakdown["generated_questions"])
+        generated_questions = "\n".join(
+            [f"- {answer.question}" for answer in breakdown["answers"]]
+        )

-        if len(breakdown["generated_questions"]) == 0:
+        if len([answer for answer in breakdown["answers"] if answer.question]) == 0:
             return EvaluationResultSkipped(
                 details="No questions could be generated from output.",
             )

+        any_noncommittal = any([answer.noncommittal for answer in breakdown["answers"]])
+
         return RagasResult(
             score=score,
             cost=cost,
-            details=f"Questions generated from output:\n{generated_questions}\nSimilarity to original question: {breakdown['similarity']}",
+            details=f"Questions generated from output:\n\n{generated_questions}\n\nSimilarity to original question: {breakdown['similarity']}\nEvasive answer: {any_noncommittal}",
         )
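
To make the improved explainability concrete, here is a rough sketch of how the new details string is assembled from the captured ResponseRelevanceOutput entries. The questions below are made-up placeholders; the field names question and noncommittal come from the diff above, while the constructor keywords are an assumption:

    # Hypothetical captured outputs from the patched _calculate_score hook.
    answers = [
        ResponseRelevanceOutput(question="What is the capital of France?", noncommittal=0),
        ResponseRelevanceOutput(question="Which city is the capital of France?", noncommittal=0),
    ]
    generated_questions = "\n".join([f"- {answer.question}" for answer in answers])
    any_noncommittal = any([answer.noncommittal for answer in answers])
    # The resulting details field would then read roughly:
    # Questions generated from output:
    #
    # - What is the capital of France?
    # - Which city is the capital of France?
    #
    # Similarity to original question: <similarity score>
    # Evasive answer: False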
