-    score: float = Field(description="How many languages were detected")
+    score: float
     passed: Optional[bool] = Field(
         description="Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language",
evaluators/openai/langevals_openai/moderation.py (4 additions, 0 deletions)
@@ -45,6 +45,10 @@ class OpenAIModerationSettings(EvaluatorSettings):
 
 
 class OpenAIModerationResult(EvaluationResult):
+    passed: Optional[bool] = Field(
+        description="Fails if any moderation category is flagged",
+        default=None,
+    )
     score: float = Field(
         description="The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence."
@@ -948,12 +945,17 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the correctness of the answer.",
+      },
+    },
   },
   "ragas/answer_relevancy": {
     name: `Ragas Answer Relevancy`,
     description: `
-This evaluator focuses on assessing how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
+Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy.
 `,
     category: "rag",
     docsUrl:
@@ -964,7 +966,7 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -976,7 +978,12 @@ This evaluator focuses on assessing how pertinent the generated answer is to the
         default: 2048,
       },
     },
-    result: {},
+    result: {
+      score: {
+        description:
+          "A score between 0.0 and 1.0 indicating the relevance of the answer.",
+      },
+    },
   },
   "ragas/context_precision": {
     name: `Ragas Context Precision`,
@@ -992,7 +999,7 @@ This metric evaluates whether all of the ground-truth relevant items present in
     settings: {
      model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1020,7 +1027,7 @@ This evaluator measures the extent to which the retrieved context aligns with th
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1048,7 +1055,7 @@ This metric gauges the relevancy of the retrieved context, calculated based on b
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1076,7 +1083,7 @@ This metric evaluates whether all of the output relevant items present in the co
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1104,7 +1111,7 @@ This evaluator assesses the extent to which the generated answer is consistent w
     settings: {
       model: {
         description: "The model to use for evaluation.",
-        default: "openai/gpt-3.5-turbo-16k",
+        default: "openai/gpt-4o-mini",
       },
       embeddings_model: {
         description: "The model to use for embeddings.",
@@ -1142,8 +1149,8 @@ Allows you to check for simple text matches or regex evaluation.
       },
     },
     result: {
-      score: {
-        description: "Returns 1 if all rules pass, 0 if any rule fails",
+      passed: {
+        description: "True if all rules pass, False if any rule fails",
       },
     },
   },
@@ -1279,9 +1286,6 @@ Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation
       },
     },
     result: {
-      score: {
-        description: "Returns 1 if LLM evaluates it as true, 0 if as false",
-      },
       passed: {
         description: "The veredict given by the LLM",
       },
@@ -1575,6 +1579,9 @@ including harassment, hate speech, self-harm, sexual content, and violence.
         description:
           "The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence.",
       },
+      passed: {
+        description: "Fails if any moderation category is flagged",