
Commit e98fc9d

Change ragas to use gpt-4o-mini by default, as it now performs well with it, and better document the relevant outputs for each evaluator
1 parent 1f21804 commit e98fc9d


8 files changed, +73 −40 lines changed


evaluators/langevals/langevals_langevals/basic.py

Lines changed: 12 additions & 5 deletions
@@ -27,14 +27,21 @@ class CustomBasicRule(BaseModel):
 
 
 class CustomBasicSettings(EvaluatorSettings):
-    rules: list[CustomBasicRule] = Field(default=[
-        CustomBasicRule(field="output", rule="not_contains", value="artificial intelligence"),
-    ], description="List of rules to check, the message must pass all of them")
+    rules: list[CustomBasicRule] = Field(
+        default=[
+            CustomBasicRule(
+                field="output", rule="not_contains", value="artificial intelligence"
+            ),
+        ],
+        description="List of rules to check, the message must pass all of them",
+    )
 
 
 class CustomBasicResult(EvaluationResult):
-    score: float = Field(description="Returns 1 if all rules pass, 0 if any rule fails")
-    passed: Optional[bool] = Field(default=True)
+    score: float
+    passed: Optional[bool] = Field(
+        default=True, description="True if all rules pass, False if any rule fails"
+    )
 
 
 class CustomBasicEvaluator(
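
For context, a minimal usage sketch of the reformatted settings, assuming the module imports as langevals_langevals.basic and that CustomBasicRule and CustomBasicSettings behave as the plain pydantic models shown above; the rule value here is purely illustrative:

from langevals_langevals.basic import CustomBasicRule, CustomBasicSettings

# Override the default rule with an illustrative one; field, rule and value
# follow the shapes shown in the diff above.
settings = CustomBasicSettings(
    rules=[
        CustomBasicRule(field="output", rule="not_contains", value="as an AI language model"),
    ]
)

# With no arguments, the default "not_contains artificial intelligence" rule applies.
default_settings = CustomBasicSettings()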

evaluators/langevals/langevals_langevals/llm_boolean.py

Lines changed: 3 additions & 3 deletions
@@ -33,10 +33,10 @@ class CustomLLMBooleanSettings(LLMEvaluatorSettings):
 
 
 class CustomLLMBooleanResult(EvaluationResult):
-    score: float = Field(
-        description="Returns 1 if LLM evaluates it as true, 0 if as false"
+    score: float
+    passed: Optional[bool] = Field(
+        description="The veredict given by the LLM", default=True
     )
-    passed: Optional[bool] = Field(description="The veredict given by the LLM", default=True)
 
 
 class CustomLLMBooleanEvaluator(
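
A small sketch of how the relocated documentation reads from code, assuming the module imports as langevals_langevals.llm_boolean and that the remaining EvaluationResult fields carry defaults; the verdict is now documented on passed, while score presumably still carries the 1/0 value it described before this commit:

from langevals_langevals.llm_boolean import CustomLLMBooleanResult

# Hypothetical result object: passed holds the LLM's verdict, score mirrors it as 1.0 / 0.0.
result = CustomLLMBooleanResult(score=1.0, passed=True)
if result.passed:
    print("The LLM judged the statement to be true")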

evaluators/lingua/langevals_lingua/language_detection.py

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ class LinguaLanguageDetectionRawResponse(BaseModel):
 
 
 class LinguaLanguageDetectionResult(EvaluationResult):
-    score: float = Field(description="How many languages were detected")
+    score: float
     passed: Optional[bool] = Field(
         description="Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language",
         default=None,

evaluators/openai/langevals_openai/moderation.py

Lines changed: 4 additions & 0 deletions
@@ -45,6 +45,10 @@ class OpenAIModerationSettings(EvaluatorSettings):
 
 
 class OpenAIModerationResult(EvaluationResult):
+    passed: Optional[bool] = Field(
+        description="Fails if any moderation category is flagged",
+        default=None,
+    )
     score: float = Field(
         description="The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence."
     )
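
A brief sketch of reading the now-documented result fields, assuming the module imports as langevals_openai.moderation and that the remaining EvaluationResult fields have defaults:

from langevals_openai.moderation import OpenAIModerationResult

# Hypothetical flagged result: passed fails (False) when any moderation category is
# flagged, and score is the model's confidence on the primary violated category.
result = OpenAIModerationResult(score=0.97, passed=False)
if result.passed is False:
    print(f"Flagged with confidence {result.score:.2f}")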

evaluators/ragas/langevals_ragas/answer_correctness.py

Lines changed: 14 additions & 7 deletions
@@ -1,23 +1,32 @@
+from typing import Optional
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
     EvaluationResultSkipped,
 )
 from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from pydantic import Field
 
 
 class RagasAnswerCorrectnessEntry(EvaluatorEntry):
-    input: str
+    input: Optional[str] = Field(default="")
     output: str
     expected_output: str
 
 
+class RagasAnswerCorrectnessResult(EvaluationResult):
+    score: float = Field(
+        description="A score between 0.0 and 1.0 indicating the correctness of the answer."
+    )
+
+
 class RagasAnswerCorrectnessEvaluator(
-    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasAnswerCorrectnessEntry, RagasSettings, RagasAnswerCorrectnessResult]
 ):
     """
-    This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better Correctness.
+    Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
     """
 
     name = "Ragas Answer Correctness"
@@ -30,13 +39,11 @@ class RagasAnswerCorrectnessEvaluator(
     is_guardrail = False
 
     def evaluate(self, entry: RagasAnswerCorrectnessEntry) -> SingleEvaluationResult:
-        content = entry.input or ""
-        if not content:
-            return EvaluationResultSkipped(details="Input is empty")
+        input = entry.input or ""
         return evaluate_ragas(
             evaluator=self,
             metric="answer_correctness",
-            question=entry.input,
+            question=input,
             answer=entry.output,
             ground_truth=entry.expected_output,
             settings=self.settings,
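
With input now optional, an entry can be built from only output and expected_output; empty inputs are no longer skipped but forwarded to Ragas as an empty question. A sketch under the assumption that BaseEvaluator accepts its settings via a settings keyword argument (the constructor is not shown in this diff):

from langevals_ragas.answer_correctness import (
    RagasAnswerCorrectnessEntry,
    RagasAnswerCorrectnessEvaluator,
)
from langevals_ragas.lib.common import RagasSettings

# input can now be omitted; it defaults to "" instead of causing a skip.
entry = RagasAnswerCorrectnessEntry(
    output="Paris is the capital of France.",
    expected_output="The capital of France is Paris.",
)

evaluator = RagasAnswerCorrectnessEvaluator(settings=RagasSettings())  # assumed constructor
result = evaluator.evaluate(entry)  # result.score is between 0.0 and 1.0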

evaluators/ragas/langevals_ragas/answer_relevancy.py

Lines changed: 10 additions & 2 deletions
@@ -1,21 +1,29 @@
 from langevals_core.base_evaluator import (
     BaseEvaluator,
+    EvaluationResult,
     EvaluatorEntry,
     SingleEvaluationResult,
 )
 from .lib.common import env_vars, evaluate_ragas, RagasSettings, RagasResult
+from pydantic import Field
 
 
 class RagasAnswerRelevancyEntry(EvaluatorEntry):
     input: str
     output: str
 
 
+class RagasAnswerRelevancyResult(EvaluationResult):
+    score: float = Field(
+        description="A score between 0.0 and 1.0 indicating the relevance of the answer."
+    )
+
+
 class RagasAnswerRelevancyEvaluator(
-    BaseEvaluator[RagasAnswerRelevancyEntry, RagasSettings, RagasResult]
+    BaseEvaluator[RagasAnswerRelevancyEntry, RagasSettings, RagasAnswerRelevancyResult]
 ):
     """
-    This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
+    Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
     """
 
     name = "Ragas Answer Relevancy"

evaluators/ragas/langevals_ragas/lib/common.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ class RagasSettings(EvaluatorSettings):
         "azure/gpt-4o-mini",
         "anthropic/claude-3-5-sonnet-20240620",
     ] = Field(
-        default="openai/gpt-3.5-turbo-16k",
+        default="openai/gpt-4o-mini",
         description="The model to use for evaluation.",
     )
     embeddings_model: Literal[
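
This is the change the commit message refers to: the default evaluation model for all Ragas evaluators moves from gpt-3.5-turbo-16k to gpt-4o-mini. A minimal sketch of what that means for callers, assuming RagasSettings is importable from langevals_ragas.lib.common, as the other diffs in this commit do:

from langevals_ragas.lib.common import RagasSettings

settings = RagasSettings()
print(settings.model)  # after this commit: "openai/gpt-4o-mini"

# Any model from the Literal shown above can still be selected explicitly:
settings = RagasSettings(model="anthropic/claude-3-5-sonnet-20240620")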

ts-integration/evaluators.generated.ts

Lines changed: 28 additions & 21 deletions
@@ -691,9 +691,6 @@ or if it's in a specific expected language.
       },
     },
     result: {
-      score: {
-        description: "How many languages were detected",
-      },
       passed: {
         description:
           "Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language",
@@ -925,18 +922,18 @@ social security numbers. It allows customization of the detection threshold and
   "ragas/answer_correctness": {
     name: `Ragas Answer Correctness`,
     description: `
-This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better Correctness.
+Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output.
 `,
     category: "rag",
     docsUrl:
       "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html",
     isGuardrail: false,
-    requiredFields: ["input", "output", "expected_output"],
-    optionalFields: [],
+    requiredFields: ["output", "expected_output"],
+    optionalFields: ["input"],
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -948,12 +945,17 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the correctness of the answer.",
+      },
+    },
   },
   "ragas/answer_relevancy": {
     name: `Ragas Answer Relevancy`,
     description: `
-This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
+Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
 `,
     category: "rag",
     docsUrl:
@@ -964,7 +966,7 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -976,7 +978,12 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
         default: 2048,
      },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the relevance of the answer.",
+      },
+    },
   },
   "ragas/context_precision": {
     name: `Ragas Context Precision`,
@@ -992,7 +999,7 @@ This metric evaluates whether all of the ground-truth relevant items present in
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1020,7 +1027,7 @@ This evaluator measures the extent to which the retrieved context aligns with th
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1048,7 +1055,7 @@ This metric gauges the relevancy of the retrieved context, calculated based on b
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1076,7 +1083,7 @@ This metric evaluates whether all of the output relevant items present in the co
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
      },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1104,7 +1111,7 @@ This evaluator assesses the extent to which the generated answer is consistent w
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1142,8 +1149,8 @@ Allows you to check for simple text matches or regex evaluation.
       },
     },
     result: {
-      score: {
-        description: "Returns 1 if all rules pass, 0 if any rule fails",
+      passed: {
+        description: "True if all rules pass, False if any rule fails",
       },
     },
   },
@@ -1279,9 +1286,6 @@ Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation
       },
     },
     result: {
-      score: {
-        description: "Returns 1 if LLM evaluates it as true, 0 if as false",
-      },
       passed: {
         description: "The veredict given by the LLM",
       },
@@ -1575,6 +1579,9 @@ including harassment, hate speech, self-harm, sexual content, and violence.
         description:
           "The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence.",
       },
+      passed: {
+        description: "Fails if any moderation category is flagged",
+      },
     },
   },
   "example/word_count": {
