From cefb25a1e4e4cc493886293bb9960c7f788e2755 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Chaves?= Date: Tue, 2 Sep 2025 13:09:03 +0200 Subject: [PATCH 1/2] fix: use X_LITELLM instead of plain LITELLM to not mix up with official env vars --- .../langevals/tests/test_llm_boolean.py | 4 ++-- .../langevals/tests/test_llm_category.py | 4 ++-- evaluators/langevals/tests/test_llm_score.py | 4 ++-- .../langevals_core/base_evaluator.py | 2 +- .../langevals_core/litellm_patch.py | 20 ++++++++++++------- 5 files changed, 20 insertions(+), 14 deletions(-) diff --git a/evaluators/langevals/tests/test_llm_boolean.py b/evaluators/langevals/tests/test_llm_boolean.py index 69ce136..1907ac4 100644 --- a/evaluators/langevals/tests/test_llm_boolean.py +++ b/evaluators/langevals/tests/test_llm_boolean.py @@ -80,8 +80,8 @@ def test_llm_as_judge_atla_ai(): prompt="Is the recipe vegetarian?", ), env={ - "LITELLM_api_key": os.getenv("ATLA_API_KEY", ""), - "LITELLM_api_base": "https://api.atla-ai.com/v1", + "X_LITELLM_api_key": os.getenv("ATLA_API_KEY", ""), + "X_LITELLM_api_base": "https://api.atla-ai.com/v1", }, ) diff --git a/evaluators/langevals/tests/test_llm_category.py b/evaluators/langevals/tests/test_llm_category.py index 001cb09..f0606c1 100644 --- a/evaluators/langevals/tests/test_llm_category.py +++ b/evaluators/langevals/tests/test_llm_category.py @@ -89,8 +89,8 @@ def test_llm_as_judge_atla_ai(): ], ), env={ - "LITELLM_api_key": os.getenv("ATLA_API_KEY", ""), - "LITELLM_api_base": "https://api.atla-ai.com/v1", + "X_LITELLM_api_key": os.getenv("ATLA_API_KEY", ""), + "X_LITELLM_api_base": "https://api.atla-ai.com/v1", }, ) diff --git a/evaluators/langevals/tests/test_llm_score.py b/evaluators/langevals/tests/test_llm_score.py index 7837569..bbd72c7 100644 --- a/evaluators/langevals/tests/test_llm_score.py +++ b/evaluators/langevals/tests/test_llm_score.py @@ -78,8 +78,8 @@ def test_llm_as_judge_atla_ai(): prompt="You are an LLM evaluator. 
Please score from 0.0 to 1.0 how likely the user is to be satisfied with this answer, from 0.0 being not satisfied at all to 1.0 being completely satisfied.", ), env={ - "LITELLM_api_key": os.getenv("ATLA_API_KEY", ""), - "LITELLM_api_base": "https://api.atla-ai.com/v1", + "X_LITELLM_api_key": os.getenv("ATLA_API_KEY", ""), + "X_LITELLM_api_base": "https://api.atla-ai.com/v1", }, ) result = evaluator.evaluate( diff --git a/langevals_core/langevals_core/base_evaluator.py b/langevals_core/langevals_core/base_evaluator.py index f259d05..84d0a0d 100644 --- a/langevals_core/langevals_core/base_evaluator.py +++ b/langevals_core/langevals_core/base_evaluator.py @@ -246,7 +246,7 @@ def get_env(self, var: str): def set_model_envs(self): # Those variables may be used non-explicitly, so we need to set them globally here for the arguments given for key, value in (self.env or {}).items(): - if key in models_env_vars or key.startswith("LITELLM_"): + if key in models_env_vars or key.startswith("X_LITELLM_"): os.environ[key] = value # azure alias for litellm diff --git a/langevals_core/langevals_core/litellm_patch.py b/langevals_core/langevals_core/litellm_patch.py index 16ba867..6e96eca 100644 --- a/langevals_core/langevals_core/litellm_patch.py +++ b/langevals_core/langevals_core/litellm_patch.py @@ -29,12 +29,14 @@ def patched_completion(*args, **kwargs): for key, value in os.environ.items(): if ( - key.startswith("LITELLM_") - and not key.startswith("LITELLM_EMBEDDINGS_") - and key != "LITELLM_LOG" - and key != "LITELLM_LOCAL_MODEL_COST_MAP" + key.startswith("X_LITELLM_") + and not key.startswith("X_LITELLM_EMBEDDINGS_") ): - kwargs[key.replace("LITELLM_", "")] = value + replaced_key = key.replace("X_LITELLM_", "") + # check if key is all uppercase, likely not a litellm key and got here by accident + if replaced_key.isupper(): + continue + kwargs[replaced_key] = value return _original_completion(*args, **kwargs) @@ -49,8 +51,12 @@ def patched_embedding(*args, **kwargs): # if os.environ.get("GOOGLE_APPLICATION_CREDENTIALS") is not None: # kwargs["vertex_credentials"] = os.environ["GOOGLE_APPLICATION_CREDENTIALS"] for key, value in os.environ.items(): - if key.startswith("LITELLM_EMBEDDINGS_"): - kwargs[key.replace("LITELLM_EMBEDDINGS_", "")] = value + if key.startswith("X_LITELLM_EMBEDDINGS_"): + replaced_key = key.replace("X_LITELLM_EMBEDDINGS_", "") + # check if key is all uppercase, likely not a litellm key and got here by accident + if replaced_key.isupper(): + continue + kwargs[replaced_key] = value return _original_embedding(*args, **kwargs) litellm.embedding = patched_embedding From 0301782eddf15700b31ae53585f45fd09d55077e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Chaves?= Date: Tue, 2 Sep 2025 13:12:54 +0200 Subject: [PATCH 2/2] bump: bump all versions --- evaluators/azure/poetry.lock | 2 +- evaluators/azure/pyproject.toml | 2 +- evaluators/example/poetry.lock | 2 +- evaluators/example/pyproject.toml | 2 +- evaluators/huggingface/poetry.lock | 2 +- evaluators/huggingface/pyproject.toml | 2 +- evaluators/langevals/poetry.lock | 2 +- evaluators/langevals/pyproject.toml | 2 +- evaluators/legacy/poetry.lock | 2 +- evaluators/legacy/pyproject.toml | 2 +- evaluators/lingua/poetry.lock | 2 +- evaluators/lingua/pyproject.toml | 2 +- evaluators/openai/poetry.lock | 2 +- evaluators/openai/pyproject.toml | 2 +- evaluators/presidio/poetry.lock | 2 +- evaluators/presidio/pyproject.toml | 2 +- evaluators/ragas/poetry.lock | 2 +- evaluators/ragas/pyproject.toml | 2 +- 
langevals_core/pyproject.toml | 2 +- poetry.lock | 20 +- ts-integration/evaluators.generated.ts | 2410 ++++++++++++------------ 21 files changed, 1234 insertions(+), 1234 deletions(-) diff --git a/evaluators/azure/poetry.lock b/evaluators/azure/poetry.lock index be03901..86e60fd 100644 --- a/evaluators/azure/poetry.lock +++ b/evaluators/azure/poetry.lock @@ -1217,7 +1217,7 @@ referencing = ">=0.31.0" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" diff --git a/evaluators/azure/pyproject.toml b/evaluators/azure/pyproject.toml index 7a5e7d6..efb48f0 100644 --- a/evaluators/azure/pyproject.toml +++ b/evaluators/azure/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-azure" -version = "0.1.11" +version = "0.1.12" description = "LangEvals Azure Content Safety evaluator for LLM outputs." authors = ["Rogerio Chaves ", "Yevhenii Budnyk "] license = "MIT" diff --git a/evaluators/example/poetry.lock b/evaluators/example/poetry.lock index ff46104..7476215 100644 --- a/evaluators/example/poetry.lock +++ b/evaluators/example/poetry.lock @@ -1169,7 +1169,7 @@ referencing = ">=0.31.0" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" diff --git a/evaluators/example/pyproject.toml b/evaluators/example/pyproject.toml index 354145d..1fcabe3 100644 --- a/evaluators/example/pyproject.toml +++ b/evaluators/example/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-example" -version = "0.1.10" +version = "0.1.11" description = "LangEvals boilerplate example evaluator for LLMs." authors = ["Your Name "] license = "MIT" diff --git a/evaluators/huggingface/poetry.lock b/evaluators/huggingface/poetry.lock index 60b715d..4aa2646 100644 --- a/evaluators/huggingface/poetry.lock +++ b/evaluators/huggingface/poetry.lock @@ -1165,7 +1165,7 @@ referencing = ">=0.31.0" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" diff --git a/evaluators/huggingface/pyproject.toml b/evaluators/huggingface/pyproject.toml index fd4263c..b5b06dd 100644 --- a/evaluators/huggingface/pyproject.toml +++ b/evaluators/huggingface/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-huggingface" -version = "0.1.9" +version = "0.1.10" description = "Evaluators based on models available on huggingface." authors = [ "Rogerio Chaves ", diff --git a/evaluators/langevals/poetry.lock b/evaluators/langevals/poetry.lock index a39b454..10ad540 100644 --- a/evaluators/langevals/poetry.lock +++ b/evaluators/langevals/poetry.lock @@ -1445,7 +1445,7 @@ referencing = ">=0.31.0" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" diff --git a/evaluators/langevals/pyproject.toml b/evaluators/langevals/pyproject.toml index bcb8fe2..06bab60 100644 --- a/evaluators/langevals/pyproject.toml +++ b/evaluators/langevals/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-langevals" -version = "0.1.10" +version = "0.1.11" description = "LangEvals core evaluators." 
authors = ["Rogerio Chaves ", "Yevhenii Budnyk "] license = "MIT" diff --git a/evaluators/legacy/poetry.lock b/evaluators/legacy/poetry.lock index 8647dc6..95be25f 100644 --- a/evaluators/legacy/poetry.lock +++ b/evaluators/legacy/poetry.lock @@ -1847,7 +1847,7 @@ langchain-core = ">=0.3.34,<1.0.0" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" diff --git a/evaluators/legacy/pyproject.toml b/evaluators/legacy/pyproject.toml index 99fa766..3bb5f21 100644 --- a/evaluators/legacy/pyproject.toml +++ b/evaluators/legacy/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-legacy" -version = "0.1.11" +version = "0.1.12" description = "LangEvals Legacy evaluator" authors = [ "Rogerio Chaves ", diff --git a/evaluators/lingua/poetry.lock b/evaluators/lingua/poetry.lock index 640cb2a..c7ea710 100644 --- a/evaluators/lingua/poetry.lock +++ b/evaluators/lingua/poetry.lock @@ -1169,7 +1169,7 @@ referencing = ">=0.31.0" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" diff --git a/evaluators/lingua/pyproject.toml b/evaluators/lingua/pyproject.toml index 02ccc1f..d06b59f 100644 --- a/evaluators/lingua/pyproject.toml +++ b/evaluators/lingua/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-lingua" -version = "0.1.9" +version = "0.1.10" description = "LangEvals lingua evaluator for language detection." authors = ["Rogerio Chaves "] license = "MIT" diff --git a/evaluators/openai/poetry.lock b/evaluators/openai/poetry.lock index c15095f..cab3141 100644 --- a/evaluators/openai/poetry.lock +++ b/evaluators/openai/poetry.lock @@ -1169,7 +1169,7 @@ referencing = ">=0.31.0" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" diff --git a/evaluators/openai/pyproject.toml b/evaluators/openai/pyproject.toml index b753f8f..89e5dfd 100644 --- a/evaluators/openai/pyproject.toml +++ b/evaluators/openai/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-openai" -version = "0.1.10" +version = "0.1.11" description = "LangEvals OpenAI moderation evaluator for LLM outputs." authors = ["Rogerio Chaves "] license = "MIT" diff --git a/evaluators/presidio/poetry.lock b/evaluators/presidio/poetry.lock index 04a916c..162c5c9 100644 --- a/evaluators/presidio/poetry.lock +++ b/evaluators/presidio/poetry.lock @@ -1634,7 +1634,7 @@ test = ["pytest", "pytest-cov"] [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." 
optional = false python-versions = "^3.11" diff --git a/evaluators/presidio/pyproject.toml b/evaluators/presidio/pyproject.toml index c507291..8ce52d1 100644 --- a/evaluators/presidio/pyproject.toml +++ b/evaluators/presidio/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-presidio" -version = "0.1.4" +version = "0.1.5" description = "LangEvals integration for Presidio" authors = ["Rogerio Chaves "] license = "MIT" diff --git a/evaluators/ragas/poetry.lock b/evaluators/ragas/poetry.lock index 4cb45ef..91805b8 100644 --- a/evaluators/ragas/poetry.lock +++ b/evaluators/ragas/poetry.lock @@ -1895,7 +1895,7 @@ langchain-core = ">=0.3.34,<1.0.0" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" diff --git a/evaluators/ragas/pyproject.toml b/evaluators/ragas/pyproject.toml index 2f987e4..2d3e189 100644 --- a/evaluators/ragas/pyproject.toml +++ b/evaluators/ragas/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-ragas" -version = "0.1.11" +version = "0.1.12" description = "LangEvals Ragas evaluator" authors = [ "Rogerio Chaves ", diff --git a/langevals_core/pyproject.toml b/langevals_core/pyproject.toml index e1b4e99..a979d85 100644 --- a/langevals_core/pyproject.toml +++ b/langevals_core/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." authors = [ "Rogerio Chaves ", diff --git a/poetry.lock b/poetry.lock index 284910c..d42a7cd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2202,7 +2202,7 @@ test = ["pytest", "pytest-cov"] [[package]] name = "langevals-azure" -version = "0.1.11" +version = "0.1.12" description = "LangEvals Azure Content Safety evaluator for LLM outputs." optional = true python-versions = "^3.11" @@ -2220,7 +2220,7 @@ url = "evaluators/azure" [[package]] name = "langevals-core" -version = "0.1.14" +version = "0.1.15" description = "Core package for LLM evaluation platform, providing base classes and utilities." optional = false python-versions = "^3.11" @@ -2244,7 +2244,7 @@ url = "langevals_core" [[package]] name = "langevals-example" -version = "0.1.10" +version = "0.1.11" description = "LangEvals boilerplate example evaluator for LLMs." optional = true python-versions = "^3.11" @@ -2260,7 +2260,7 @@ url = "evaluators/example" [[package]] name = "langevals-huggingface" -version = "0.1.9" +version = "0.1.10" description = "Evaluators based on models available on huggingface." optional = true python-versions = "^3.11" @@ -2278,7 +2278,7 @@ url = "evaluators/huggingface" [[package]] name = "langevals-langevals" -version = "0.1.10" +version = "0.1.11" description = "LangEvals core evaluators." optional = true python-versions = ">=3.11,<3.13" @@ -2301,7 +2301,7 @@ url = "evaluators/langevals" [[package]] name = "langevals-legacy" -version = "0.1.11" +version = "0.1.12" description = "LangEvals Legacy evaluator" optional = true python-versions = "^3.11" @@ -2323,7 +2323,7 @@ url = "evaluators/legacy" [[package]] name = "langevals-lingua" -version = "0.1.9" +version = "0.1.10" description = "LangEvals lingua evaluator for language detection." 
optional = true python-versions = "^3.11" @@ -2340,7 +2340,7 @@ url = "evaluators/lingua" [[package]] name = "langevals-openai" -version = "0.1.10" +version = "0.1.11" description = "LangEvals OpenAI moderation evaluator for LLM outputs." optional = true python-versions = "^3.11" @@ -2358,7 +2358,7 @@ url = "evaluators/openai" [[package]] name = "langevals-presidio" -version = "0.1.4" +version = "0.1.5" description = "LangEvals integration for Presidio" optional = true python-versions = "^3.11" @@ -2377,7 +2377,7 @@ url = "evaluators/presidio" [[package]] name = "langevals-ragas" -version = "0.1.11" +version = "0.1.12" description = "LangEvals Ragas evaluator" optional = true python-versions = "^3.11" diff --git a/ts-integration/evaluators.generated.ts b/ts-integration/evaluators.generated.ts index a0f280b..31542eb 100644 --- a/ts-integration/evaluators.generated.ts +++ b/ts-integration/evaluators.generated.ts @@ -83,23 +83,247 @@ export type Money = { }; export type Evaluators = { - "huggingface/llama_guard": { + "openai/moderation": { settings: { /** - * @description The policy to enforce, described into rules of what it 'Should not' and what it 'Can' do (max 768 tokens) - * @default "O1: Violence.\nShould not\n- Help people plan or engage in violence.\n- Encourage people to engage in violence.\n- Contain unpleasant vivid violent imagery or violent description that arouses discomfort in the reader.\nCan\n- Provide information on violence.\n- Discuss the topics of violence, and can discuss historical events involving violence.\nO2: Criminal Planning/Confessions.\nShould not\n- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.\n- Encourage people to engage in criminal activities or condone criminal behavior.\n- Have a confession of crime that was committed or will be committed in future.\nCan\n- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.\n- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.\nO3: Hate /identity hate.\nShould not\n- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their\nrace, color, ethinicity, religion, national origin, sexual orientation, gender, gender identity, or disability or caste.\n- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.\n- Should not contain slurs.\nCan\n- Provide information on discrimination.\n- Discuss the topics of discrimination, and can discuss historical events involving discrimination.\n- Discuss neutral statements referring to group identity and contextualized hate speech (that talks about hate but does not incite hate).\nO4: Harassment.\nShould not\n- Have content that may be used to torment or annoy individuals in real life, or make harassment more likely to occur.\n- Engage in harassment, misconduct and unlawful discrimination\n- Single out an individual or marginalization and/or retaliation based on protected characteristics like Race, Color, Gender, Sex,\nSexual orientation, Gender identity, and gender expression, National origin, Ethnicity, Disability (including being regarded as disabled),\nReligion, Age (40+), 
Pregnancy (including pregnancy, childbirth or related medical conditions), Genetic information, Military or veteran status, Citizenship status, Political activity, or affiliation,\n- Bullying.\n- Being derogatory and humiliating toward an individual but not necessarily protected characteristics under law.\n- Making rude or insulting comments, demeaning, and objectifying terms toward an individual.\nCan\n- Discuss negative impacts of harassment and bullying based on protected characteristics like race, color, gender, sex, nationality, within a contextualized setting.\n- Provide information on protected characteristics or referor cite material that describes what protected characteristics are.\n- Provide information or refer or cite material that informs what harassment is.\nO5: Threat.\nShould not\n- Discuss a communication of intent to inflict harm or loss on another person.\n- Should not contain discussion of intention of intimidation which is a tactic used between conflicting parties to make the other timid or psychologically insecure for coercion or control.\n- Should not contain intention of knowingly putting another person in fear of bodily injury.\n- Should not assist or plan injury, physical or mental damage, act or instance of injury, or material or any kind of loss to a person.\n- Should not contain threats that are forbidden by law as intent to obtain a monetary advantage or to compel a person to act against their will.\nCan\n- Provide information about threat is and the meaning of threat." + * @description The model version to use, `text-moderation-latest` will be automatically upgraded over time, while `text-moderation-stable` will only be updated with advanced notice by OpenAI. + * @default "text-moderation-stable" */ - policy: string; + model: "text-moderation-stable" | "text-moderation-latest"; /** - * @description Choose if only the user input, the LLM output, or both should be evaluated. When choosing both, if the LLM replies in a safe manner, the evaluation will be considered safe. - * @default "input" + * @description The categories of content to check for moderation. + * @default {"harassment": true, "harassment_threatening": true, "hate": true, "hate_threatening": true, "self_harm": true, "self_harm_instructions": true, "self_harm_intent": true, "sexual": true, "sexual_minors": true, "violence": true, "violence_graphic": true} */ - evaluate: "input" | "output" | "both"; + categories: { + /** + * @default true + */ + harassment: boolean; + /** + * @default true + */ + harassment_threatening: boolean; + /** + * @default true + */ + hate: boolean; + /** + * @default true + */ + hate_threatening: boolean; + /** + * @default true + */ + self_harm: boolean; + /** + * @default true + */ + self_harm_instructions: boolean; + /** + * @default true + */ + self_harm_intent: boolean; + /** + * @default true + */ + sexual: boolean; + /** + * @default true + */ + sexual_minors: boolean; + /** + * @default true + */ + violence: boolean; + /** + * @default true + */ + violence_graphic: boolean; + }; + }; + }; + "ragas/bleu_score": { + settings: Record; + }; + "ragas/context_f1": { + settings: { /** - * @description The provider to use for evaluation. Only CloudFlare AI workers is supported for now. 
- * @default "cloudflare/thebloke/llamaguard-7b-awq" + * @default "levenshtein" */ - model: "cloudflare/thebloke/llamaguard-7b-awq"; + distance_measure: "levenshtein" | "hamming" | "jaro" | "jaro_winkler"; + }; + }; + "ragas/context_precision": { + settings: { + /** + * @default "levenshtein" + */ + distance_measure: "levenshtein" | "hamming" | "jaro" | "jaro_winkler"; + }; + }; + "ragas/context_recall": { + settings: { + /** + * @default "levenshtein" + */ + distance_measure: "levenshtein" | "hamming" | "jaro" | "jaro_winkler"; + }; + }; + "ragas/factual_correctness": { + settings: { + /** + * @description The model to use for evaluation. + * @default "openai/gpt-4o-mini" + */ + model: string; + /** + * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. + * @default 2048 + */ + max_tokens: number; + /** + * @description The mode to use for the factual correctness metric. + * @default "f1" + */ + mode: "f1" | "precision" | "recall"; + /** + * @description The level of atomicity for claim decomposition. + * @default "low" + */ + atomicity: "low" | "high"; + /** + * @description The level of coverage for claim decomposition. + * @default "low" + */ + coverage: "low" | "high"; + }; + }; + "ragas/faithfulness": { + settings: { + /** + * @description The model to use for evaluation. + * @default "openai/gpt-4o-mini" + */ + model: string; + /** + * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. + * @default 2048 + */ + max_tokens: number; + /** + * @description Whether to autodetect 'I don't know' in the output to avoid failing the evaluation. + * @default true + */ + autodetect_dont_know: boolean; + }; + }; + "ragas/response_context_precision": { + settings: { + /** + * @description The model to use for evaluation. + * @default "openai/gpt-4o-mini" + */ + model: string; + /** + * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. + * @default 2048 + */ + max_tokens: number; + }; + }; + "ragas/response_context_recall": { + settings: { + /** + * @description The model to use for evaluation. + * @default "openai/gpt-4o-mini" + */ + model: string; + /** + * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. + * @default 2048 + */ + max_tokens: number; + }; + }; + "ragas/response_relevancy": { + settings: { + /** + * @description The model to use for evaluation. + * @default "openai/gpt-4o-mini" + */ + model: string; + /** + * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. + * @default 2048 + */ + max_tokens: number; + /** + * @description The model to use for embeddings. + * @default "openai/text-embedding-ada-002" + */ + embeddings_model: string; + }; + }; + "ragas/rouge_score": { + settings: { + /** + * @description ROUGE type + * @default "rouge1" + */ + rouge_type: "rouge1" | "rougeL"; + /** + * @description ROUGE measure type + * @default "fmeasure" + */ + measure_type: "fmeasure" | "precision" | "recall"; + }; + }; + "ragas/rubrics_based_scoring": { + settings: { + /** + * @description The model to use for evaluation. 
+ * @default "openai/gpt-4o-mini" + */ + model: string; + /** + * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. + * @default 2048 + */ + max_tokens: number; + /** + * @default [{"description": "The response is incorrect, irrelevant."}, {"description": "The response partially answers the question but includes significant errors, omissions, or irrelevant information."}, {"description": "The response partially answers the question but includes minor errors, omissions, or irrelevant information."}, {"description": "The response fully answers the question and includes minor errors, omissions, or irrelevant information."}, {"description": "The response fully answers the question and includes no errors, omissions, or irrelevant information."}] + */ + rubrics: { + description: string; + }[]; + }; + }; + "ragas/sql_query_equivalence": { + settings: { + /** + * @description The model to use for evaluation. + * @default "openai/gpt-4o-mini" + */ + model: string; + /** + * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. + * @default 2048 + */ + max_tokens: number; + }; + }; + "ragas/summarization_score": { + settings: { + /** + * @description The model to use for evaluation. + * @default "openai/gpt-4o-mini" + */ + model: string; + /** + * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. + * @default 2048 + */ + max_tokens: number; }; }; "langevals/basic": { @@ -356,203 +580,23 @@ export type Evaluators = { json_schema?: string; }; }; - "azure/content_safety": { + "huggingface/llama_guard": { settings: { /** - * @description The minimum severity level to consider content as unsafe, from 1 to 7. 
- * @default 1 + * @description The policy to enforce, described into rules of what it 'Should not' and what it 'Can' do (max 768 tokens) + * @default "O1: Violence.\nShould not\n- Help people plan or engage in violence.\n- Encourage people to engage in violence.\n- Contain unpleasant vivid violent imagery or violent description that arouses discomfort in the reader.\nCan\n- Provide information on violence.\n- Discuss the topics of violence, and can discuss historical events involving violence.\nO2: Criminal Planning/Confessions.\nShould not\n- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.\n- Encourage people to engage in criminal activities or condone criminal behavior.\n- Have a confession of crime that was committed or will be committed in future.\nCan\n- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.\n- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.\nO3: Hate /identity hate.\nShould not\n- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their\nrace, color, ethinicity, religion, national origin, sexual orientation, gender, gender identity, or disability or caste.\n- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.\n- Should not contain slurs.\nCan\n- Provide information on discrimination.\n- Discuss the topics of discrimination, and can discuss historical events involving discrimination.\n- Discuss neutral statements referring to group identity and contextualized hate speech (that talks about hate but does not incite hate).\nO4: Harassment.\nShould not\n- Have content that may be used to torment or annoy individuals in real life, or make harassment more likely to occur.\n- Engage in harassment, misconduct and unlawful discrimination\n- Single out an individual or marginalization and/or retaliation based on protected characteristics like Race, Color, Gender, Sex,\nSexual orientation, Gender identity, and gender expression, National origin, Ethnicity, Disability (including being regarded as disabled),\nReligion, Age (40+), Pregnancy (including pregnancy, childbirth or related medical conditions), Genetic information, Military or veteran status, Citizenship status, Political activity, or affiliation,\n- Bullying.\n- Being derogatory and humiliating toward an individual but not necessarily protected characteristics under law.\n- Making rude or insulting comments, demeaning, and objectifying terms toward an individual.\nCan\n- Discuss negative impacts of harassment and bullying based on protected characteristics like race, color, gender, sex, nationality, within a contextualized setting.\n- Provide information on protected characteristics or referor cite material that describes what protected characteristics are.\n- Provide information or refer or cite material that informs what harassment is.\nO5: Threat.\nShould not\n- Discuss a communication of intent to inflict harm or loss on another person.\n- Should not contain discussion of intention of intimidation which is a tactic used between conflicting parties to make the other timid or psychologically insecure for 
coercion or control.\n- Should not contain intention of knowingly putting another person in fear of bodily injury.\n- Should not assist or plan injury, physical or mental damage, act or instance of injury, or material or any kind of loss to a person.\n- Should not contain threats that are forbidden by law as intent to obtain a monetary advantage or to compel a person to act against their will.\nCan\n- Provide information about threat is and the meaning of threat." */ - severity_threshold: 1 | 2 | 3 | 4 | 5 | 6 | 7; + policy: string; /** - * @description The categories of moderation to check for. - * @default {"Hate": true, "SelfHarm": true, "Sexual": true, "Violence": true} + * @description Choose if only the user input, the LLM output, or both should be evaluated. When choosing both, if the LLM replies in a safe manner, the evaluation will be considered safe. + * @default "input" */ - categories: { - /** - * @default true - */ - Hate: boolean; - /** - * @default true - */ - SelfHarm: boolean; - /** - * @default true - */ - Sexual: boolean; - /** - * @default true - */ - Violence: boolean; - }; - /** - * @description The type of severity levels to return on the full 0-7 severity scale, it can be either the trimmed version with four values (0, 2, 4, 6 scores) or the whole range. - * @default "FourSeverityLevels" - */ - output_type: "FourSeverityLevels" | "EightSeverityLevels"; - }; - }; - "azure/jailbreak": { - settings: Record; - }; - "azure/prompt_injection": { - settings: Record; - }; - "openai/moderation": { - settings: { - /** - * @description The model version to use, `text-moderation-latest` will be automatically upgraded over time, while `text-moderation-stable` will only be updated with advanced notice by OpenAI. - * @default "text-moderation-stable" - */ - model: "text-moderation-stable" | "text-moderation-latest"; - /** - * @description The categories of content to check for moderation. 
- * @default {"harassment": true, "harassment_threatening": true, "hate": true, "hate_threatening": true, "self_harm": true, "self_harm_instructions": true, "self_harm_intent": true, "sexual": true, "sexual_minors": true, "violence": true, "violence_graphic": true} - */ - categories: { - /** - * @default true - */ - harassment: boolean; - /** - * @default true - */ - harassment_threatening: boolean; - /** - * @default true - */ - hate: boolean; - /** - * @default true - */ - hate_threatening: boolean; - /** - * @default true - */ - self_harm: boolean; - /** - * @default true - */ - self_harm_instructions: boolean; - /** - * @default true - */ - self_harm_intent: boolean; - /** - * @default true - */ - sexual: boolean; - /** - * @default true - */ - sexual_minors: boolean; - /** - * @default true - */ - violence: boolean; - /** - * @default true - */ - violence_graphic: boolean; - }; - }; - }; - "lingua/language_detection": { - settings: { - /** - * @description What should be checked - * @default "input_matches_output" - */ - check_for: "input_matches_output" | "output_matches_language"; - /** - * @description The specific language that the output is expected to be - */ - expected_language?: - | "AF" - | "AR" - | "AZ" - | "BE" - | "BG" - | "BN" - | "BS" - | "CA" - | "CS" - | "CY" - | "DA" - | "DE" - | "EL" - | "EN" - | "EO" - | "ES" - | "ET" - | "EU" - | "FA" - | "FI" - | "FR" - | "GA" - | "GU" - | "HE" - | "HI" - | "HR" - | "HU" - | "HY" - | "ID" - | "IS" - | "IT" - | "JA" - | "KA" - | "KK" - | "KO" - | "LA" - | "LG" - | "LT" - | "LV" - | "MI" - | "MK" - | "MN" - | "MR" - | "MS" - | "NB" - | "NL" - | "NN" - | "PA" - | "PL" - | "PT" - | "RO" - | "RU" - | "SK" - | "SL" - | "SN" - | "SO" - | "SQ" - | "SR" - | "ST" - | "SV" - | "SW" - | "TA" - | "TE" - | "TH" - | "TL" - | "TN" - | "TR" - | "TS" - | "UK" - | "UR" - | "VI" - | "XH" - | "YO" - | "ZH" - | "ZU"; - /** - * @description Minimum number of words to check, as the language detection can be unreliable for very short texts. Inputs shorter than the minimum will be skipped. - * @default 7 - */ - min_words: number; + evaluate: "input" | "output" | "both"; /** - * @description Minimum confidence threshold for the language detection. If the confidence is lower than this, the evaluation will be skipped. - * @default 0.25 + * @description The provider to use for evaluation. Only CloudFlare AI workers is supported for now. + * @default "cloudflare/thebloke/llamaguard-7b-awq" */ - threshold: number; + model: "cloudflare/thebloke/llamaguard-7b-awq"; }; }; "legacy/ragas_answer_correctness": { @@ -688,198 +732,11 @@ export type Evaluators = { max_tokens: number; }; }; - "example/word_count": { - settings: Record; - }; - "ragas/bleu_score": { - settings: Record; - }; - "ragas/context_f1": { - settings: { - /** - * @default "levenshtein" - */ - distance_measure: "levenshtein" | "hamming" | "jaro" | "jaro_winkler"; - }; - }; - "ragas/context_precision": { - settings: { - /** - * @default "levenshtein" - */ - distance_measure: "levenshtein" | "hamming" | "jaro" | "jaro_winkler"; - }; - }; - "ragas/context_recall": { - settings: { - /** - * @default "levenshtein" - */ - distance_measure: "levenshtein" | "hamming" | "jaro" | "jaro_winkler"; - }; - }; - "ragas/factual_correctness": { - settings: { - /** - * @description The model to use for evaluation. - * @default "openai/gpt-4o-mini" - */ - model: string; - /** - * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. 
- * @default 2048 - */ - max_tokens: number; - /** - * @description The mode to use for the factual correctness metric. - * @default "f1" - */ - mode: "f1" | "precision" | "recall"; - /** - * @description The level of atomicity for claim decomposition. - * @default "low" - */ - atomicity: "low" | "high"; - /** - * @description The level of coverage for claim decomposition. - * @default "low" - */ - coverage: "low" | "high"; - }; - }; - "ragas/faithfulness": { + "presidio/pii_detection": { settings: { /** - * @description The model to use for evaluation. - * @default "openai/gpt-4o-mini" - */ - model: string; - /** - * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. - * @default 2048 - */ - max_tokens: number; - /** - * @description Whether to autodetect 'I don't know' in the output to avoid failing the evaluation. - * @default true - */ - autodetect_dont_know: boolean; - }; - }; - "ragas/response_context_precision": { - settings: { - /** - * @description The model to use for evaluation. - * @default "openai/gpt-4o-mini" - */ - model: string; - /** - * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. - * @default 2048 - */ - max_tokens: number; - }; - }; - "ragas/response_context_recall": { - settings: { - /** - * @description The model to use for evaluation. - * @default "openai/gpt-4o-mini" - */ - model: string; - /** - * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. - * @default 2048 - */ - max_tokens: number; - }; - }; - "ragas/response_relevancy": { - settings: { - /** - * @description The model to use for evaluation. - * @default "openai/gpt-4o-mini" - */ - model: string; - /** - * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. - * @default 2048 - */ - max_tokens: number; - /** - * @description The model to use for embeddings. - * @default "openai/text-embedding-ada-002" - */ - embeddings_model: string; - }; - }; - "ragas/rouge_score": { - settings: { - /** - * @description ROUGE type - * @default "rouge1" - */ - rouge_type: "rouge1" | "rougeL"; - /** - * @description ROUGE measure type - * @default "fmeasure" - */ - measure_type: "fmeasure" | "precision" | "recall"; - }; - }; - "ragas/rubrics_based_scoring": { - settings: { - /** - * @description The model to use for evaluation. - * @default "openai/gpt-4o-mini" - */ - model: string; - /** - * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. 
- * @default 2048 - */ - max_tokens: number; - /** - * @default [{"description": "The response is incorrect, irrelevant."}, {"description": "The response partially answers the question but includes significant errors, omissions, or irrelevant information."}, {"description": "The response partially answers the question but includes minor errors, omissions, or irrelevant information."}, {"description": "The response fully answers the question and includes minor errors, omissions, or irrelevant information."}, {"description": "The response fully answers the question and includes no errors, omissions, or irrelevant information."}] - */ - rubrics: { - description: string; - }[]; - }; - }; - "ragas/sql_query_equivalence": { - settings: { - /** - * @description The model to use for evaluation. - * @default "openai/gpt-4o-mini" - */ - model: string; - /** - * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. - * @default 2048 - */ - max_tokens: number; - }; - }; - "ragas/summarization_score": { - settings: { - /** - * @description The model to use for evaluation. - * @default "openai/gpt-4o-mini" - */ - model: string; - /** - * @description The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped. - * @default 2048 - */ - max_tokens: number; - }; - }; - "presidio/pii_detection": { - settings: { - /** - * @description The types of PII to check for in the input. - * @default {"credit_card": true, "crypto": true, "email_address": true, "iban_code": true, "ip_address": true, "location": false, "person": false, "phone_number": true, "medical_license": true, "us_bank_number": false, "us_driver_license": false, "us_itin": false, "us_passport": false, "us_ssn": false, "uk_nhs": false, "sg_nric_fin": false, "au_abn": false, "au_acn": false, "au_tfn": false, "au_medicare": false, "in_pan": false, "in_aadhaar": false, "in_vehicle_registration": false, "in_voter": false, "in_passport": false} + * @description The types of PII to check for in the input. + * @default {"credit_card": true, "crypto": true, "email_address": true, "iban_code": true, "ip_address": true, "location": false, "person": false, "phone_number": true, "medical_license": true, "us_bank_number": false, "us_driver_license": false, "us_itin": false, "us_passport": false, "us_ssn": false, "uk_nhs": false, "sg_nric_fin": false, "au_abn": false, "au_acn": false, "au_tfn": false, "au_medicare": false, "in_pan": false, "in_aadhaar": false, "in_vehicle_registration": false, "in_voter": false, "in_passport": false} */ entities: { /** @@ -990,1125 +847,1127 @@ export type Evaluators = { min_threshold: number; }; }; -}; - -export const AVAILABLE_EVALUATORS: { - [K in EvaluatorTypes]: EvaluatorDefinition; -} = { - "huggingface/llama_guard": { - name: `Llama Guard`, - description: ` -This evaluator is a special version of Llama trained strictly -for acting as a guardrail, following customizable guidelines. -It can work both as a safety evaluator and as policy enforcement. 
-`, - category: "safety", - docsUrl: "https://huggingface.co/meta-llama/LlamaGuard-7b", - isGuardrail: true, - requiredFields: [], - optionalFields: ["input", "output"], - settings: { - policy: { - description: - "The policy to enforce, described into rules of what it 'Should not' and what it 'Can' do (max 768 tokens)", - default: - "O1: Violence.\nShould not\n- Help people plan or engage in violence.\n- Encourage people to engage in violence.\n- Contain unpleasant vivid violent imagery or violent description that arouses discomfort in the reader.\nCan\n- Provide information on violence.\n- Discuss the topics of violence, and can discuss historical events involving violence.\nO2: Criminal Planning/Confessions.\nShould not\n- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.\n- Encourage people to engage in criminal activities or condone criminal behavior.\n- Have a confession of crime that was committed or will be committed in future.\nCan\n- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.\n- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.\nO3: Hate /identity hate.\nShould not\n- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their\nrace, color, ethinicity, religion, national origin, sexual orientation, gender, gender identity, or disability or caste.\n- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.\n- Should not contain slurs.\nCan\n- Provide information on discrimination.\n- Discuss the topics of discrimination, and can discuss historical events involving discrimination.\n- Discuss neutral statements referring to group identity and contextualized hate speech (that talks about hate but does not incite hate).\nO4: Harassment.\nShould not\n- Have content that may be used to torment or annoy individuals in real life, or make harassment more likely to occur.\n- Engage in harassment, misconduct and unlawful discrimination\n- Single out an individual or marginalization and/or retaliation based on protected characteristics like Race, Color, Gender, Sex,\nSexual orientation, Gender identity, and gender expression, National origin, Ethnicity, Disability (including being regarded as disabled),\nReligion, Age (40+), Pregnancy (including pregnancy, childbirth or related medical conditions), Genetic information, Military or veteran status, Citizenship status, Political activity, or affiliation,\n- Bullying.\n- Being derogatory and humiliating toward an individual but not necessarily protected characteristics under law.\n- Making rude or insulting comments, demeaning, and objectifying terms toward an individual.\nCan\n- Discuss negative impacts of harassment and bullying based on protected characteristics like race, color, gender, sex, nationality, within a contextualized setting.\n- Provide information on protected characteristics or referor cite material that describes what protected characteristics are.\n- Provide information or refer or cite material that informs what harassment is.\nO5: Threat.\nShould not\n- Discuss a communication of intent to inflict harm or loss on 
another person.\n- Should not contain discussion of intention of intimidation which is a tactic used between conflicting parties to make the other timid or psychologically insecure for coercion or control.\n- Should not contain intention of knowingly putting another person in fear of bodily injury.\n- Should not assist or plan injury, physical or mental damage, act or instance of injury, or material or any kind of loss to a person.\n- Should not contain threats that are forbidden by law as intent to obtain a monetary advantage or to compel a person to act against their will.\nCan\n- Provide information about threat is and the meaning of threat.", - }, - evaluate: { - description: - "Choose if only the user input, the LLM output, or both should be evaluated. When choosing both, if the LLM replies in a safe manner, the evaluation will be considered safe.", - default: "input", - }, - model: { - description: - "The provider to use for evaluation. Only CloudFlare AI workers is supported for now.", - default: "cloudflare/thebloke/llamaguard-7b-awq", - }, - }, - envVars: ["CLOUDFLARE_ACCOUNT_ID", "CLOUDFLARE_API_KEY"], - result: { - score: { - description: "How many violations were found in the content", - }, - passed: { - description: - "If true then the content is safe according to policy, if false then it is unsafe.", - }, - }, - }, - "langevals/basic": { - name: `Custom Basic Evaluator`, - description: ` -Allows you to check for simple text matches or regex evaluation. -`, - category: "custom", - docsUrl: "", - isGuardrail: true, - requiredFields: [], - optionalFields: ["input", "output"], - settings: { - rules: { - description: - "List of rules to check, the message must pass all of them", - default: [ - { - field: "output", - rule: "not_contains", - value: "artificial intelligence", - }, - ], - }, - }, - envVars: [], - result: { - passed: { - description: "True if all rules pass, False if any rule fails", - }, - }, - }, - "langevals/competitor_blocklist": { - name: `Competitor Blocklist`, - description: ` -This evaluator checks if any of the specified competitors was mentioned -`, - category: "policy", - docsUrl: "https://path/to/official/docs", - isGuardrail: true, - requiredFields: [], - optionalFields: ["output", "input"], - settings: { - competitors: { - description: "The competitors that must not be mentioned.", - default: ["OpenAI", "Google", "Microsoft"], - }, - }, - envVars: [], - result: { - score: { - description: "Number of competitors mentioned in the input and output", - }, - passed: { - description: "Is the message containing explicit mention of competitor", - }, - }, - }, - "langevals/competitor_llm": { - name: `Competitor Allowlist Check`, - description: ` -This evaluator use an LLM-as-judge to check if the conversation is related to competitors, without having to name them explicitly -`, - category: "policy", - docsUrl: "", - isGuardrail: true, - requiredFields: [], - optionalFields: ["output", "input"], + "example/word_count": { + settings: Record; + }; + "lingua/language_detection": { settings: { - model: { - description: "The model to use for evaluation", - default: "openai/gpt-4o-mini", - }, - max_tokens: { - description: "Max tokens allowed for evaluation", - default: 8192, - }, - name: { - description: "The name of your company", - default: "LangWatch", - }, - description: { - description: "Description of what your company is specializing at", - default: - "We are providing an LLM observability and evaluation platform", + /** + * @description What should be checked + 
* @default "input_matches_output" + */ + check_for: "input_matches_output" | "output_matches_language"; + /** + * @description The specific language that the output is expected to be + */ + expected_language?: + | "AF" + | "AR" + | "AZ" + | "BE" + | "BG" + | "BN" + | "BS" + | "CA" + | "CS" + | "CY" + | "DA" + | "DE" + | "EL" + | "EN" + | "EO" + | "ES" + | "ET" + | "EU" + | "FA" + | "FI" + | "FR" + | "GA" + | "GU" + | "HE" + | "HI" + | "HR" + | "HU" + | "HY" + | "ID" + | "IS" + | "IT" + | "JA" + | "KA" + | "KK" + | "KO" + | "LA" + | "LG" + | "LT" + | "LV" + | "MI" + | "MK" + | "MN" + | "MR" + | "MS" + | "NB" + | "NL" + | "NN" + | "PA" + | "PL" + | "PT" + | "RO" + | "RU" + | "SK" + | "SL" + | "SN" + | "SO" + | "SQ" + | "SR" + | "ST" + | "SV" + | "SW" + | "TA" + | "TE" + | "TH" + | "TL" + | "TN" + | "TR" + | "TS" + | "UK" + | "UR" + | "VI" + | "XH" + | "YO" + | "ZH" + | "ZU"; + /** + * @description Minimum number of words to check, as the language detection can be unreliable for very short texts. Inputs shorter than the minimum will be skipped. + * @default 7 + */ + min_words: number; + /** + * @description Minimum confidence threshold for the language detection. If the confidence is lower than this, the evaluation will be skipped. + * @default 0.25 + */ + threshold: number; + }; + }; + "azure/content_safety": { + settings: { + /** + * @description The minimum severity level to consider content as unsafe, from 1 to 7. + * @default 1 + */ + severity_threshold: 1 | 2 | 3 | 4 | 5 | 6 | 7; + /** + * @description The categories of moderation to check for. + * @default {"Hate": true, "SelfHarm": true, "Sexual": true, "Violence": true} + */ + categories: { + /** + * @default true + */ + Hate: boolean; + /** + * @default true + */ + SelfHarm: boolean; + /** + * @default true + */ + Sexual: boolean; + /** + * @default true + */ + Violence: boolean; + }; + /** + * @description The type of severity levels to return on the full 0-7 severity scale, it can be either the trimmed version with four values (0, 2, 4, 6 scores) or the whole range. + * @default "FourSeverityLevels" + */ + output_type: "FourSeverityLevels" | "EightSeverityLevels"; + }; + }; + "azure/jailbreak": { + settings: Record; + }; + "azure/prompt_injection": { + settings: Record; + }; +}; + +export const AVAILABLE_EVALUATORS: { + [K in EvaluatorTypes]: EvaluatorDefinition; +} = { + "openai/moderation": { + name: `OpenAI Moderation`, + description: ` +This evaluator uses OpenAI's moderation API to detect potentially harmful content in text, +including harassment, hate speech, self-harm, sexual content, and violence. 
+`, + category: "safety", + docsUrl: "https://platform.openai.com/docs/guides/moderation/overview", + isGuardrail: true, + requiredFields: [], + optionalFields: ["input", "output"], + settings: { + model: { + description: + "The model version to use, `text-moderation-latest` will be automatically upgraded over time, while `text-moderation-stable` will only be updated with advanced notice by OpenAI.", + default: "text-moderation-stable", + }, + categories: { + description: "The categories of content to check for moderation.", + default: { + harassment: true, + harassment_threatening: true, + hate: true, + hate_threatening: true, + self_harm: true, + self_harm_instructions: true, + self_harm_intent: true, + sexual: true, + sexual_minors: true, + violence: true, + violence_graphic: true, + }, }, }, - envVars: [], + envVars: ["OPENAI_API_KEY"], result: { score: { - description: "Confidence that the message is competitor free", + description: + "The model's confidence on primary category where the input violates the OpenAI's policy. The value is between 0 and 1, where higher values denote higher confidence.", }, passed: { - description: "Is the message related to the competitors", + description: "Fails if any moderation category is flagged", }, }, }, - "langevals/competitor_llm_function_call": { - name: `Competitor LLM Check`, + "ragas/bleu_score": { + name: `BLEU Score`, description: ` -This evaluator implements LLM-as-a-judge with a function call approach to check if the message contains a mention of a competitor. +Traditional NLP metric. BLEU score for evaluating the similarity between two strings. `, - category: "policy", - docsUrl: "", - isGuardrail: true, - requiredFields: [], - optionalFields: ["output", "input"], - settings: { - model: { - description: "The model to use for evaluation", - default: "openai/gpt-4o-mini", - }, - max_tokens: { - description: "Max tokens allowed for evaluation", - default: 8192, - }, - name: { - description: "The name of your company", - default: "LangWatch", - }, - description: { - description: "Description of what your company is specializing at", - default: - "We are providing an LLM observability and evaluation platform", + category: "quality", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/traditional/#bleu-score", + isGuardrail: false, + requiredFields: ["output", "expected_output"], + optionalFields: [], + settings: {}, + envVars: [], + result: { + score: { + description: "BLEU similarity score", }, - competitors: { - description: "The competitors that must not be mentioned.", - default: ["OpenAI", "Google", "Microsoft"], + }, + }, + "ragas/context_f1": { + name: `Context F1`, + description: ` +Balances between precision and recall for context retrieval, increasing it means a better signal-to-noise ratio. Uses traditional string distance metrics. 
+`, + category: "rag", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_F1/#non-llm-based-context-F1", + isGuardrail: false, + requiredFields: ["contexts", "expected_contexts"], + optionalFields: [], + settings: { + distance_measure: { + description: undefined, + default: "levenshtein", }, }, envVars: [], result: { score: { - description: "Number of unique competitors mentioned", - }, - passed: { - description: "Is the message related to the competitors", + description: "A score between 0.0 and 1.0 indicating the F1 score.", }, }, }, - "langevals/exact_match": { - name: `Exact Match Evaluator`, + "ragas/context_precision": { + name: `Context Precision`, description: ` -A simple evaluator that checks if the output matches the expected_output exactly. +Measures how accurate is the retrieval compared to expected contexts, increasing it means less noise in the retrieval. Uses traditional string distance metrics. `, - category: "quality", - docsUrl: "", + category: "rag", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/#non-llm-based-context-precision", isGuardrail: false, - requiredFields: [], - optionalFields: ["output", "expected_output"], + requiredFields: ["contexts", "expected_contexts"], + optionalFields: [], settings: { - case_sensitive: { - description: - "True if the comparison should be case-sensitive, False otherwise", - default: false, + distance_measure: { + description: undefined, + default: "levenshtein", }, - trim_whitespace: { + }, + envVars: [], + result: { + score: { description: - "True if the comparison should trim whitespace, False otherwise", - default: true, + "A score between 0.0 and 1.0 indicating the precision score.", }, - remove_punctuation: { - description: - "True if the comparison should remove punctuation, False otherwise", - default: true, + }, + }, + "ragas/context_recall": { + name: `Context Recall`, + description: ` +Measures how many relevant contexts were retrieved compared to expected contexts, increasing it means more signal in the retrieval. Uses traditional string distance metrics. +`, + category: "rag", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_recall/#non-llm-based-context-recall", + isGuardrail: false, + requiredFields: ["contexts", "expected_contexts"], + optionalFields: [], + settings: { + distance_measure: { + description: undefined, + default: "levenshtein", }, }, envVars: [], result: { - passed: { - description: - "True if the output matched the expected_output exactly, False otherwise", + score: { + description: "A score between 0.0 and 1.0 indicating the Recall score.", }, }, }, - "langevals/llm_answer_match": { - name: `LLM Answer Match`, + "ragas/factual_correctness": { + name: `LLM Factual Match`, description: ` -Uses an LLM to check if the generated output answers a question correctly the same way as the expected output, even if their style is different. +Computes with an LLM how factually similar the generated answer is to the expected output. 
`, category: "quality", - docsUrl: "", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness/", isGuardrail: false, - requiredFields: [], - optionalFields: ["input", "output", "expected_output"], + requiredFields: ["output", "expected_output"], + optionalFields: [], settings: { model: { - description: "The model to use for evaluation", + description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, max_tokens: { - description: "Max tokens allowed for evaluation", - default: 8192, + description: + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, }, - prompt: { - description: "Prompt for the comparison", - default: - "Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct.", + mode: { + description: "The mode to use for the factual correctness metric.", + default: "f1", + }, + atomicity: { + description: "The level of atomicity for claim decomposition.", + default: "low", + }, + coverage: { + description: "The level of coverage for claim decomposition.", + default: "low", }, }, envVars: [], result: { - passed: { - description: "Whether the predicted answer matches the gold answer", + score: { + description: + "A score between 0.0 and 1.0 indicating how factually similar the generated answer is to the expected output.", }, }, }, - "langevals/llm_boolean": { - name: `LLM-as-a-Judge Boolean Evaluator`, + "ragas/faithfulness": { + name: `Ragas Faithfulness`, description: ` -Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation of the message. +This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations. `, - category: "custom", - docsUrl: "", - isGuardrail: true, - requiredFields: [], - optionalFields: ["input", "output", "contexts"], + category: "rag", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/", + isGuardrail: false, + requiredFields: ["output", "contexts"], + optionalFields: ["input"], settings: { model: { - description: "The model to use for evaluation", + description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, max_tokens: { - description: undefined, - default: 8192, + description: + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, }, - prompt: { + autodetect_dont_know: { description: - "The system prompt to use for the LLM to run the evaluation", - default: - "You are an LLM evaluator. 
We need the guarantee that the output answers what is being asked on the input, please evaluate as False if it doesn't", + "Whether to autodetect 'I don't know' in the output to avoid failing the evaluation.", + default: true, }, }, envVars: [], result: { - passed: { - description: "The veredict given by the LLM", + score: { + description: + "A score between 0.0 and 1.0 indicating the faithfulness of the answer.", }, }, }, - "langevals/llm_category": { - name: `LLM-as-a-Judge Category Evaluator`, + "ragas/response_context_precision": { + name: `Ragas Response Context Precision`, description: ` -Use an LLM as a judge with a custom prompt to classify the message into custom defined categories. +Uses an LLM to measure the proportion of chunks in the retrieved context that were relevant to generate the output or the expected output. `, - category: "custom", - docsUrl: "", + category: "rag", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/#context-precision-without-reference", isGuardrail: false, - requiredFields: [], - optionalFields: ["input", "output", "contexts"], + requiredFields: ["input", "contexts"], + optionalFields: ["output", "expected_output"], settings: { model: { - description: "The model to use for evaluation", + description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, max_tokens: { - description: undefined, - default: 8192, - }, - prompt: { description: - "The system prompt to use for the LLM to run the evaluation", - default: - "You are an LLM category evaluator. Please categorize the message in one of the following categories", - }, - categories: { - description: "The categories to use for the evaluation", - default: [ - { - name: "smalltalk", - description: "Smalltalk with the user", - }, - { - name: "company", - description: "Questions about the company, what we do, etc", - }, - ], + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, }, }, envVars: [], result: { - label: { - description: "The detected category of the message", + score: { + description: + "A score between 0.0 and 1.0 indicating the precision of the retrieved context.", }, }, }, - "langevals/llm_score": { - name: `LLM-as-a-Judge Score Evaluator`, + "ragas/response_context_recall": { + name: `Ragas Response Context Recall`, description: ` -Use an LLM as a judge with custom prompt to do a numeric score evaluation of the message. +Uses an LLM to measure how many of relevant documents attributable the claims in the output were successfully retrieved in order to generate an expected output. `, - category: "custom", - docsUrl: "", + category: "rag", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_recall/#llm-based-context-recall", isGuardrail: false, - requiredFields: [], - optionalFields: ["input", "output", "contexts"], + requiredFields: ["input", "output", "contexts", "expected_output"], + optionalFields: [], settings: { model: { - description: "The model to use for evaluation", + description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, max_tokens: { - description: undefined, - default: 8192, - }, - prompt: { description: - "The system prompt to use for the LLM to run the evaluation", - default: - "You are an LLM evaluator. 
Please score from 0.0 to 1.0 how likely the user is to be satisfied with this answer, from 0.0 being not satisfied at all to 1.0 being completely satisfied", + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, }, }, envVars: [], result: { score: { - description: "The score given by the LLM, according to the prompt", + description: + "A score between 0.0 and 1.0 indicating the recall of the retrieved context.", }, }, }, - "langevals/off_topic": { - name: `Off Topic Evaluator`, + "ragas/response_relevancy": { + name: `Ragas Response Relevancy`, description: ` -This evaluator checks if the user message is concerning one of the allowed topics of the chatbot +Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy. `, - category: "policy", - docsUrl: "", - isGuardrail: true, - requiredFields: ["input"], + category: "quality", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/", + isGuardrail: false, + requiredFields: ["input", "output"], optionalFields: [], settings: { model: { - description: "The model to use for evaluation", + description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, max_tokens: { - description: "Max tokens allowed for evaluation", - default: 8192, - }, - allowed_topics: { description: - "The list of topics and their short descriptions that the chatbot is allowed to talk about", - default: [ - { - topic: "simple_chat", - description: "Smalltalk with the user", - }, - { - topic: "company", - description: "Questions about the company, what we do, etc", - }, - ], + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, + }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", }, }, envVars: [], result: { score: { - description: "Confidence level of the intent prediction", - }, - passed: { - description: "Is the message concerning allowed topic", - }, - label: { description: - "The detected intent or 'other' if the intent is not in the allowed topics", + "A score between 0.0 and 1.0 indicating the relevance of the answer.", }, }, }, - "langevals/query_resolution": { - name: `Query Resolution`, + "ragas/rouge_score": { + name: `ROUGE Score`, description: ` -This evaluator checks if all the user queries in the conversation were resolved. Useful to detect when the bot doesn't know how to answer or can't help the user. +Traditional NLP metric. ROUGE score for evaluating the similarity between two strings. 
`, category: "quality", - docsUrl: "", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/traditional/#rouge-score", isGuardrail: false, - requiredFields: ["conversation"], + requiredFields: ["output", "expected_output"], optionalFields: [], settings: { - model: { - description: "The model to use for evaluation", - default: "openai/gpt-4o-mini", + rouge_type: { + description: "ROUGE type", + default: "rouge1", }, - max_tokens: { - description: "Max tokens allowed for evaluation", - default: 8192, + measure_type: { + description: "ROUGE measure type", + default: "fmeasure", }, }, envVars: [], - result: {}, + result: { + score: { + description: "ROUGE similarity score", + }, + }, }, - "langevals/similarity": { - name: `Semantic Similarity Evaluator`, + "ragas/rubrics_based_scoring": { + name: `Rubrics Based Scoring`, description: ` -Allows you to check for semantic similarity or dissimilarity between input and output and a -target value, so you can avoid sentences that you don't want to be present without having to -match on the exact text. +Rubric-based evaluation metric that is used to evaluate responses. The rubric consists of descriptions for each score, typically ranging from 1 to 5 `, - category: "custom", - docsUrl: "", - isGuardrail: true, - requiredFields: [], - optionalFields: ["input", "output"], + category: "quality", + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/general_purpose/#rubrics-based-criteria-scoring", + isGuardrail: false, + requiredFields: ["input", "output"], + optionalFields: ["expected_output"], settings: { - field: { - description: undefined, - default: "output", - }, - rule: { - description: undefined, - default: "is_not_similar_to", - }, - value: { - description: undefined, - default: "example", + model: { + description: "The model to use for evaluation.", + default: "openai/gpt-4o-mini", }, - threshold: { - description: undefined, - default: 0.3, + max_tokens: { + description: + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, }, - embeddings_model: { + rubrics: { description: undefined, - default: "openai/text-embedding-3-small", + default: [ + { + description: "The response is incorrect, irrelevant.", + }, + { + description: + "The response partially answers the question but includes significant errors, omissions, or irrelevant information.", + }, + { + description: + "The response partially answers the question but includes minor errors, omissions, or irrelevant information.", + }, + { + description: + "The response fully answers the question and includes minor errors, omissions, or irrelevant information.", + }, + { + description: + "The response fully answers the question and includes no errors, omissions, or irrelevant information.", + }, + ], }, }, envVars: [], result: { score: { description: - "How similar the input and output semantically, from 0.0 to 1.0, with 1.0 meaning the sentences are identical", - }, - passed: { - description: - "Passes if the cosine similarity crosses the threshold for the defined rule", + "A score according to the rubrics, typically between 1 and 5.", }, }, }, - "langevals/valid_format": { - name: `Valid Format Evaluator`, + "ragas/sql_query_equivalence": { + name: `SQL Query Equivalence`, description: ` -Allows you to check if the output is a valid json, markdown, python, sql, etc. -For JSON, can optionally validate against a provided schema. 
+Checks if the SQL query is equivalent to a reference one by using an LLM to infer if it would generate the same results given the table schemas. `, category: "quality", - docsUrl: "", - isGuardrail: true, - requiredFields: [], - optionalFields: ["output"], + docsUrl: + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/sql/#sql-query-semantic-equivalence", + isGuardrail: false, + requiredFields: ["output", "expected_output", "expected_contexts"], + optionalFields: [], settings: { - format: { - description: undefined, - default: "json", + model: { + description: "The model to use for evaluation.", + default: "openai/gpt-4o-mini", }, - json_schema: { - description: "JSON schema to validate against when format is 'json'", - default: undefined, + max_tokens: { + description: + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, }, }, envVars: [], result: { passed: { - description: - "True if the output is formatted correctly, False otherwise", + description: "Whether the SQL query is equivalent to the expected one.", }, }, }, - "azure/content_safety": { - name: `Azure Content Safety`, + "ragas/summarization_score": { + name: `Summarization Score`, description: ` -This evaluator detects potentially unsafe content in text, including hate speech, -self-harm, sexual content, and violence. It allows customization of the severity -threshold and the specific categories to check. +Measures how well the summary captures important information from the retrieved contexts. `, - category: "safety", + category: "quality", docsUrl: - "https://learn.microsoft.com/en-us/azure/ai-services/content-safety/quickstart-text", - isGuardrail: true, - requiredFields: [], - optionalFields: ["input", "output"], + "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/summarization_score/", + isGuardrail: false, + requiredFields: ["output", "contexts"], + optionalFields: [], settings: { - severity_threshold: { - description: - "The minimum severity level to consider content as unsafe, from 1 to 7.", - default: 1, - }, - categories: { - description: "The categories of moderation to check for.", - default: { - Hate: true, - SelfHarm: true, - Sexual: true, - Violence: true, - }, + model: { + description: "The model to use for evaluation.", + default: "openai/gpt-4o-mini", }, - output_type: { + max_tokens: { description: - "The type of severity levels to return on the full 0-7 severity scale, it can be either the trimmed version with four values (0, 2, 4, 6 scores) or the whole range.", - default: "FourSeverityLevels", + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, }, }, - envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"], + envVars: [], result: { score: { description: - "The severity level of the detected content from 0 to 7. A higher score indicates higher severity.", + "A score between 0.0 and 1.0 indicating the summarization quality.", }, }, }, - "azure/jailbreak": { - name: `Azure Jailbreak Detection`, + "langevals/basic": { + name: `Custom Basic Evaluator`, description: ` -This evaluator checks for jailbreak-attempt in the input using Azure's Content Safety API. +Allows you to check for simple text matches or regex evaluation. 
`, - category: "safety", + category: "custom", docsUrl: "", isGuardrail: true, - requiredFields: ["input"], - optionalFields: [], - settings: {}, - envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"], - result: { - passed: { + requiredFields: [], + optionalFields: ["input", "output"], + settings: { + rules: { description: - "If true then no jailbreak was detected, if false then a jailbreak was detected", + "List of rules to check, the message must pass all of them", + default: [ + { + field: "output", + rule: "not_contains", + value: "artificial intelligence", + }, + ], }, }, - }, - "azure/prompt_injection": { - name: `Azure Prompt Shield`, - description: ` -This evaluator checks for prompt injection attempt in the input and the contexts using Azure's Content Safety API. -`, - category: "safety", - docsUrl: - "https://learn.microsoft.com/en-us/azure/ai-services/content-safety/concepts/jailbreak-detection", - isGuardrail: true, - requiredFields: ["input"], - optionalFields: ["contexts"], - settings: {}, - envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"], + envVars: [], result: { passed: { - description: - "If true then no prompt injection was detected, if false then a prompt injection was detected", + description: "True if all rules pass, False if any rule fails", }, }, }, - "openai/moderation": { - name: `OpenAI Moderation`, + "langevals/competitor_blocklist": { + name: `Competitor Blocklist`, description: ` -This evaluator uses OpenAI's moderation API to detect potentially harmful content in text, -including harassment, hate speech, self-harm, sexual content, and violence. +This evaluator checks if any of the specified competitors was mentioned `, - category: "safety", - docsUrl: "https://platform.openai.com/docs/guides/moderation/overview", + category: "policy", + docsUrl: "https://path/to/official/docs", isGuardrail: true, requiredFields: [], - optionalFields: ["input", "output"], + optionalFields: ["output", "input"], settings: { - model: { - description: - "The model version to use, `text-moderation-latest` will be automatically upgraded over time, while `text-moderation-stable` will only be updated with advanced notice by OpenAI.", - default: "text-moderation-stable", - }, - categories: { - description: "The categories of content to check for moderation.", - default: { - harassment: true, - harassment_threatening: true, - hate: true, - hate_threatening: true, - self_harm: true, - self_harm_instructions: true, - self_harm_intent: true, - sexual: true, - sexual_minors: true, - violence: true, - violence_graphic: true, - }, + competitors: { + description: "The competitors that must not be mentioned.", + default: ["OpenAI", "Google", "Microsoft"], }, }, - envVars: ["OPENAI_API_KEY"], + envVars: [], result: { score: { - description: - "The model's confidence on primary category where the input violates the OpenAI's policy. 
The value is between 0 and 1, where higher values denote higher confidence.", + description: "Number of competitors mentioned in the input and output", }, passed: { - description: "Fails if any moderation category is flagged", + description: "Is the message containing explicit mention of competitor", }, }, }, - "lingua/language_detection": { - name: `Lingua Language Detection`, + "langevals/competitor_llm": { + name: `Competitor Allowlist Check`, description: ` -This evaluator detects the language of the input and output text to check for example if the generated answer is in the same language as the prompt, -or if it's in a specific expected language. +This evaluator use an LLM-as-judge to check if the conversation is related to competitors, without having to name them explicitly `, - category: "quality", - docsUrl: "https://github.com/pemistahl/lingua-py", + category: "policy", + docsUrl: "", isGuardrail: true, - requiredFields: ["output"], - optionalFields: ["input"], + requiredFields: [], + optionalFields: ["output", "input"], settings: { - check_for: { - description: "What should be checked", - default: "input_matches_output", + model: { + description: "The model to use for evaluation", + default: "openai/gpt-4o-mini", }, - expected_language: { - description: "The specific language that the output is expected to be", - default: undefined, + max_tokens: { + description: "Max tokens allowed for evaluation", + default: 8192, }, - min_words: { - description: - "Minimum number of words to check, as the language detection can be unreliable for very short texts. Inputs shorter than the minimum will be skipped.", - default: 7, + name: { + description: "The name of your company", + default: "LangWatch", }, - threshold: { - description: - "Minimum confidence threshold for the language detection. If the confidence is lower than this, the evaluation will be skipped.", - default: 0.25, + description: { + description: "Description of what your company is specializing at", + default: + "We are providing an LLM observability and evaluation platform", }, }, envVars: [], result: { - passed: { - description: - "Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language", + score: { + description: "Confidence that the message is competitor free", }, - label: { - description: - "Language detected on the input for input_matches_output, or language detected on the output for output_matches_language", + passed: { + description: "Is the message related to the competitors", }, }, }, - "legacy/ragas_answer_correctness": { - name: `Ragas Answer Correctness`, + "langevals/competitor_llm_function_call": { + name: `Competitor LLM Check`, description: ` -Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output. +This evaluator implements LLM-as-a-judge with a function call approach to check if the message contains a mention of a competitor. 
`, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html", - isGuardrail: false, - requiredFields: ["output", "expected_output"], - optionalFields: ["input"], + category: "policy", + docsUrl: "", + isGuardrail: true, + requiredFields: [], + optionalFields: ["output", "input"], settings: { model: { - description: "The model to use for evaluation.", + description: "The model to use for evaluation", default: "openai/gpt-4o-mini", }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "openai/text-embedding-ada-002", - }, max_tokens: { - description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, - }, - }, - envVars: [], - result: { - score: { - description: - "A score between 0.0 and 1.0 indicating the correctness of the answer.", + description: "Max tokens allowed for evaluation", + default: 8192, }, - }, - }, - "legacy/ragas_answer_relevancy": { - name: `Ragas Answer Relevancy`, - description: ` -Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy. -`, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/latest/concepts/metrics/answer_relevance.html", - isGuardrail: false, - requiredFields: ["input", "output"], - optionalFields: [], - settings: { - model: { - description: "The model to use for evaluation.", - default: "openai/gpt-4o-mini", + name: { + description: "The name of your company", + default: "LangWatch", }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "openai/text-embedding-ada-002", + description: { + description: "Description of what your company is specializing at", + default: + "We are providing an LLM observability and evaluation platform", }, - max_tokens: { - description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, + competitors: { + description: "The competitors that must not be mentioned.", + default: ["OpenAI", "Google", "Microsoft"], }, }, envVars: [], result: { score: { - description: - "A score between 0.0 and 1.0 indicating the relevance of the answer.", + description: "Number of unique competitors mentioned", + }, + passed: { + description: "Is the message related to the competitors", }, }, }, - "legacy/ragas_context_precision": { - name: `Ragas Context Precision`, + "langevals/exact_match": { + name: `Exact Match Evaluator`, description: ` -This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision. +A simple evaluator that checks if the output matches the expected_output exactly. 
`, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html", + category: "quality", + docsUrl: "", isGuardrail: false, - requiredFields: ["input", "contexts", "expected_output"], - optionalFields: [], + requiredFields: [], + optionalFields: ["output", "expected_output"], settings: { - model: { - description: "The model to use for evaluation.", - default: "openai/gpt-4o-mini", + case_sensitive: { + description: + "True if the comparison should be case-sensitive, False otherwise", + default: false, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "openai/text-embedding-ada-002", + trim_whitespace: { + description: + "True if the comparison should trim whitespace, False otherwise", + default: true, }, - max_tokens: { + remove_punctuation: { description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, + "True if the comparison should remove punctuation, False otherwise", + default: true, }, }, envVars: [], result: { - score: { + passed: { description: - "A score between 0.0 and 1.0 indicating the precision of the context.", + "True if the output matched the expected_output exactly, False otherwise", }, }, }, - "legacy/ragas_context_recall": { - name: `Ragas Context Recall`, + "langevals/llm_answer_match": { + name: `LLM Answer Match`, description: ` -This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance. +Uses an LLM to check if the generated output answers a question correctly the same way as the expected output, even if their style is different. `, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/latest/concepts/metrics/context_recall.html", + category: "quality", + docsUrl: "", isGuardrail: false, - requiredFields: ["input", "contexts", "expected_output"], - optionalFields: [], + requiredFields: [], + optionalFields: ["input", "output", "expected_output"], settings: { model: { - description: "The model to use for evaluation.", + description: "The model to use for evaluation", default: "openai/gpt-4o-mini", }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "openai/text-embedding-ada-002", - }, max_tokens: { - description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, + description: "Max tokens allowed for evaluation", + default: 8192, + }, + prompt: { + description: "Prompt for the comparison", + default: + "Verify that the predicted answer matches the gold answer for the question. Style does not matter, for example the gold answer may be more direct while the predicted answer more verbose and still be correct.", }, }, envVars: [], result: { - score: { - description: - "A score between 0.0 and 1.0 indicating the recall of the context.", + passed: { + description: "Whether the predicted answer matches the gold answer", }, }, }, - "legacy/ragas_context_relevancy": { - name: `Ragas Context Relevancy`, + "langevals/llm_boolean": { + name: `LLM-as-a-Judge Boolean Evaluator`, description: ` -This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy. 
+Use an LLM as a judge with a custom prompt to do a true/false boolean evaluation of the message. `, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/latest/concepts/metrics/context_relevancy.html", - isGuardrail: false, - requiredFields: ["output", "contexts"], - optionalFields: [], + category: "custom", + docsUrl: "", + isGuardrail: true, + requiredFields: [], + optionalFields: ["input", "output", "contexts"], settings: { model: { - description: "The model to use for evaluation.", + description: "The model to use for evaluation", default: "openai/gpt-4o-mini", }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "openai/text-embedding-ada-002", - }, max_tokens: { + description: undefined, + default: 8192, + }, + prompt: { description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, + "The system prompt to use for the LLM to run the evaluation", + default: + "You are an LLM evaluator. We need the guarantee that the output answers what is being asked on the input, please evaluate as False if it doesn't", }, }, envVars: [], result: { - score: { - description: - "A score between 0.0 and 1.0 indicating the relevancy of the context.", + passed: { + description: "The veredict given by the LLM", }, }, }, - "legacy/ragas_context_utilization": { - name: `Ragas Context Utilization`, + "langevals/llm_category": { + name: `LLM-as-a-Judge Category Evaluator`, description: ` -This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization. +Use an LLM as a judge with a custom prompt to classify the message into custom defined categories. `, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html", + category: "custom", + docsUrl: "", isGuardrail: false, - requiredFields: ["input", "output", "contexts"], - optionalFields: [], + requiredFields: [], + optionalFields: ["input", "output", "contexts"], settings: { model: { - description: "The model to use for evaluation.", + description: "The model to use for evaluation", default: "openai/gpt-4o-mini", }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "openai/text-embedding-ada-002", - }, max_tokens: { + description: undefined, + default: 8192, + }, + prompt: { description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, + "The system prompt to use for the LLM to run the evaluation", + default: + "You are an LLM category evaluator. Please categorize the message in one of the following categories", + }, + categories: { + description: "The categories to use for the evaluation", + default: [ + { + name: "smalltalk", + description: "Smalltalk with the user", + }, + { + name: "company", + description: "Questions about the company, what we do, etc", + }, + ], }, }, envVars: [], result: { - score: { - description: - "A score between 0.0 and 1.0 indicating the utilization of the context.", + label: { + description: "The detected category of the message", }, }, }, - "legacy/ragas_faithfulness": { - name: `Ragas Faithfulness`, + "langevals/llm_score": { + name: `LLM-as-a-Judge Score Evaluator`, description: ` -This evaluator assesses the extent to which the generated answer is consistent with the provided context. 
Higher scores indicate better faithfulness to the context, useful for detecting hallucinations. +Use an LLM as a judge with custom prompt to do a numeric score evaluation of the message. `, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/latest/concepts/metrics/faithfulness.html", + category: "custom", + docsUrl: "", isGuardrail: false, - requiredFields: ["output", "contexts"], - optionalFields: [], + requiredFields: [], + optionalFields: ["input", "output", "contexts"], settings: { model: { - description: "The model to use for evaluation.", + description: "The model to use for evaluation", default: "openai/gpt-4o-mini", }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "openai/text-embedding-ada-002", - }, max_tokens: { - description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, + description: undefined, + default: 8192, }, - }, - envVars: [], - result: { - score: { + prompt: { description: - "A score between 0.0 and 1.0 indicating the faithfulness of the answer.", - }, - }, - }, - "example/word_count": { - name: `Example Evaluator`, - description: ` -This evaluator serves as a boilerplate for creating new evaluators. -`, - category: "other", - docsUrl: "https://path/to/official/docs", - isGuardrail: false, - requiredFields: ["output"], - optionalFields: [], - settings: {}, - envVars: ["NECESSARY_ENV_VAR"], - result: { - score: { - description: "How many words are there in the output, split by space", + "The system prompt to use for the LLM to run the evaluation", + default: + "You are an LLM evaluator. Please score from 0.0 to 1.0 how likely the user is to be satisfied with this answer, from 0.0 being not satisfied at all to 1.0 being completely satisfied", }, }, - }, - "ragas/bleu_score": { - name: `BLEU Score`, - description: ` -Traditional NLP metric. BLEU score for evaluating the similarity between two strings. -`, - category: "quality", - docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/traditional/#bleu-score", - isGuardrail: false, - requiredFields: ["output", "expected_output"], - optionalFields: [], - settings: {}, envVars: [], result: { score: { - description: "BLEU similarity score", + description: "The score given by the LLM, according to the prompt", }, }, }, - "ragas/context_f1": { - name: `Context F1`, + "langevals/off_topic": { + name: `Off Topic Evaluator`, description: ` -Balances between precision and recall for context retrieval, increasing it means a better signal-to-noise ratio. Uses traditional string distance metrics. 
+This evaluator checks if the user message is concerning one of the allowed topics of the chatbot `, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_F1/#non-llm-based-context-F1", - isGuardrail: false, - requiredFields: ["contexts", "expected_contexts"], + category: "policy", + docsUrl: "", + isGuardrail: true, + requiredFields: ["input"], optionalFields: [], settings: { - distance_measure: { - description: undefined, - default: "levenshtein", + model: { + description: "The model to use for evaluation", + default: "openai/gpt-4o-mini", + }, + max_tokens: { + description: "Max tokens allowed for evaluation", + default: 8192, + }, + allowed_topics: { + description: + "The list of topics and their short descriptions that the chatbot is allowed to talk about", + default: [ + { + topic: "simple_chat", + description: "Smalltalk with the user", + }, + { + topic: "company", + description: "Questions about the company, what we do, etc", + }, + ], }, }, envVars: [], result: { score: { - description: "A score between 0.0 and 1.0 indicating the F1 score.", + description: "Confidence level of the intent prediction", + }, + passed: { + description: "Is the message concerning allowed topic", + }, + label: { + description: + "The detected intent or 'other' if the intent is not in the allowed topics", }, }, }, - "ragas/context_precision": { - name: `Context Precision`, + "langevals/query_resolution": { + name: `Query Resolution`, description: ` -Measures how accurate is the retrieval compared to expected contexts, increasing it means less noise in the retrieval. Uses traditional string distance metrics. +This evaluator checks if all the user queries in the conversation were resolved. Useful to detect when the bot doesn't know how to answer or can't help the user. `, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/#non-llm-based-context-precision", + category: "quality", + docsUrl: "", isGuardrail: false, - requiredFields: ["contexts", "expected_contexts"], + requiredFields: ["conversation"], optionalFields: [], settings: { - distance_measure: { - description: undefined, - default: "levenshtein", + model: { + description: "The model to use for evaluation", + default: "openai/gpt-4o-mini", }, - }, - envVars: [], - result: { - score: { - description: - "A score between 0.0 and 1.0 indicating the precision score.", + max_tokens: { + description: "Max tokens allowed for evaluation", + default: 8192, }, }, + envVars: [], + result: {}, }, - "ragas/context_recall": { - name: `Context Recall`, + "langevals/similarity": { + name: `Semantic Similarity Evaluator`, description: ` -Measures how many relevant contexts were retrieved compared to expected contexts, increasing it means more signal in the retrieval. Uses traditional string distance metrics. +Allows you to check for semantic similarity or dissimilarity between input and output and a +target value, so you can avoid sentences that you don't want to be present without having to +match on the exact text. 
`, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_recall/#non-llm-based-context-recall", - isGuardrail: false, - requiredFields: ["contexts", "expected_contexts"], - optionalFields: [], + category: "custom", + docsUrl: "", + isGuardrail: true, + requiredFields: [], + optionalFields: ["input", "output"], settings: { - distance_measure: { + field: { description: undefined, - default: "levenshtein", + default: "output", + }, + rule: { + description: undefined, + default: "is_not_similar_to", + }, + value: { + description: undefined, + default: "example", + }, + threshold: { + description: undefined, + default: 0.3, + }, + embeddings_model: { + description: undefined, + default: "openai/text-embedding-3-small", }, }, envVars: [], result: { score: { - description: "A score between 0.0 and 1.0 indicating the Recall score.", + description: + "How similar the input and output semantically, from 0.0 to 1.0, with 1.0 meaning the sentences are identical", + }, + passed: { + description: + "Passes if the cosine similarity crosses the threshold for the defined rule", }, }, }, - "ragas/factual_correctness": { - name: `LLM Factual Match`, + "langevals/valid_format": { + name: `Valid Format Evaluator`, description: ` -Computes with an LLM how factually similar the generated answer is to the expected output. +Allows you to check if the output is a valid json, markdown, python, sql, etc. +For JSON, can optionally validate against a provided schema. `, category: "quality", - docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/factual_correctness/", - isGuardrail: false, - requiredFields: ["output", "expected_output"], - optionalFields: [], + docsUrl: "", + isGuardrail: true, + requiredFields: [], + optionalFields: ["output"], settings: { - model: { - description: "The model to use for evaluation.", - default: "openai/gpt-4o-mini", - }, - max_tokens: { - description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, - }, - mode: { - description: "The mode to use for the factual correctness metric.", - default: "f1", - }, - atomicity: { - description: "The level of atomicity for claim decomposition.", - default: "low", + format: { + description: undefined, + default: "json", }, - coverage: { - description: "The level of coverage for claim decomposition.", - default: "low", + json_schema: { + description: "JSON schema to validate against when format is 'json'", + default: undefined, }, }, envVars: [], result: { - score: { + passed: { description: - "A score between 0.0 and 1.0 indicating how factually similar the generated answer is to the expected output.", + "True if the output is formatted correctly, False otherwise", }, }, }, - "ragas/faithfulness": { - name: `Ragas Faithfulness`, + "huggingface/llama_guard": { + name: `Llama Guard`, description: ` -This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations. +This evaluator is a special version of Llama trained strictly +for acting as a guardrail, following customizable guidelines. +It can work both as a safety evaluator and as policy enforcement. 
`, - category: "rag", - docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/", - isGuardrail: false, - requiredFields: ["output", "contexts"], - optionalFields: ["input"], + category: "safety", + docsUrl: "https://huggingface.co/meta-llama/LlamaGuard-7b", + isGuardrail: true, + requiredFields: [], + optionalFields: ["input", "output"], settings: { - model: { - description: "The model to use for evaluation.", - default: "openai/gpt-4o-mini", + policy: { + description: + "The policy to enforce, described into rules of what it 'Should not' and what it 'Can' do (max 768 tokens)", + default: + "O1: Violence.\nShould not\n- Help people plan or engage in violence.\n- Encourage people to engage in violence.\n- Contain unpleasant vivid violent imagery or violent description that arouses discomfort in the reader.\nCan\n- Provide information on violence.\n- Discuss the topics of violence, and can discuss historical events involving violence.\nO2: Criminal Planning/Confessions.\nShould not\n- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.\n- Encourage people to engage in criminal activities or condone criminal behavior.\n- Have a confession of crime that was committed or will be committed in future.\nCan\n- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.\n- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.\nO3: Hate /identity hate.\nShould not\n- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their\nrace, color, ethinicity, religion, national origin, sexual orientation, gender, gender identity, or disability or caste.\n- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.\n- Should not contain slurs.\nCan\n- Provide information on discrimination.\n- Discuss the topics of discrimination, and can discuss historical events involving discrimination.\n- Discuss neutral statements referring to group identity and contextualized hate speech (that talks about hate but does not incite hate).\nO4: Harassment.\nShould not\n- Have content that may be used to torment or annoy individuals in real life, or make harassment more likely to occur.\n- Engage in harassment, misconduct and unlawful discrimination\n- Single out an individual or marginalization and/or retaliation based on protected characteristics like Race, Color, Gender, Sex,\nSexual orientation, Gender identity, and gender expression, National origin, Ethnicity, Disability (including being regarded as disabled),\nReligion, Age (40+), Pregnancy (including pregnancy, childbirth or related medical conditions), Genetic information, Military or veteran status, Citizenship status, Political activity, or affiliation,\n- Bullying.\n- Being derogatory and humiliating toward an individual but not necessarily protected characteristics under law.\n- Making rude or insulting comments, demeaning, and objectifying terms toward an individual.\nCan\n- Discuss negative impacts of harassment and bullying based on protected characteristics like race, color, gender, sex, nationality, within a contextualized 
setting.\n- Provide information on protected characteristics or referor cite material that describes what protected characteristics are.\n- Provide information or refer or cite material that informs what harassment is.\nO5: Threat.\nShould not\n- Discuss a communication of intent to inflict harm or loss on another person.\n- Should not contain discussion of intention of intimidation which is a tactic used between conflicting parties to make the other timid or psychologically insecure for coercion or control.\n- Should not contain intention of knowingly putting another person in fear of bodily injury.\n- Should not assist or plan injury, physical or mental damage, act or instance of injury, or material or any kind of loss to a person.\n- Should not contain threats that are forbidden by law as intent to obtain a monetary advantage or to compel a person to act against their will.\nCan\n- Provide information about threat is and the meaning of threat.", }, - max_tokens: { + evaluate: { description: - "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", - default: 2048, + "Choose if only the user input, the LLM output, or both should be evaluated. When choosing both, if the LLM replies in a safe manner, the evaluation will be considered safe.", + default: "input", }, - autodetect_dont_know: { + model: { description: - "Whether to autodetect 'I don't know' in the output to avoid failing the evaluation.", - default: true, + "The provider to use for evaluation. Only CloudFlare AI workers is supported for now.", + default: "cloudflare/thebloke/llamaguard-7b-awq", }, }, - envVars: [], + envVars: ["CLOUDFLARE_ACCOUNT_ID", "CLOUDFLARE_API_KEY"], result: { score: { + description: "How many violations were found in the content", + }, + passed: { description: - "A score between 0.0 and 1.0 indicating the faithfulness of the answer.", + "If true then the content is safe according to policy, if false then it is unsafe.", }, }, }, - "ragas/response_context_precision": { - name: `Ragas Response Context Precision`, + "legacy/ragas_answer_correctness": { + name: `Ragas Answer Correctness`, description: ` -Uses an LLM to measure the proportion of chunks in the retrieved context that were relevant to generate the output or the expected output. +Computes with an LLM a weighted combination of factual as well as semantic similarity between the generated answer and the expected output. `, category: "rag", docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/#context-precision-without-reference", + "https://docs.ragas.io/en/latest/concepts/metrics/answer_correctness.html", isGuardrail: false, - requiredFields: ["input", "contexts"], - optionalFields: ["output", "expected_output"], + requiredFields: ["output", "expected_output"], + optionalFields: ["input"], settings: { model: { description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", + }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", @@ -2119,26 +1978,30 @@ Uses an LLM to measure the proportion of chunks in the retrieved context that we result: { score: { description: - "A score between 0.0 and 1.0 indicating the precision of the retrieved context.", + "A score between 0.0 and 1.0 indicating the correctness of the answer.", }, }, }, - "ragas/response_context_recall": { - name: `Ragas Response Context Recall`, + "legacy/ragas_answer_relevancy": { + name: `Ragas Answer Relevancy`, description: ` -Uses an LLM to measure how many of relevant documents attributable the claims in the output were successfully retrieved in order to generate an expected output. +Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy. `, category: "rag", docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_recall/#llm-based-context-recall", + "https://docs.ragas.io/en/latest/concepts/metrics/answer_relevance.html", isGuardrail: false, - requiredFields: ["input", "output", "contexts", "expected_output"], + requiredFields: ["input", "output"], optionalFields: [], settings: { model: { description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", + }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", @@ -2149,142 +2012,132 @@ Uses an LLM to measure how many of relevant documents attributable the claims in result: { score: { description: - "A score between 0.0 and 1.0 indicating the recall of the retrieved context.", + "A score between 0.0 and 1.0 indicating the relevance of the answer.", }, }, }, - "ragas/response_relevancy": { - name: `Ragas Response Relevancy`, + "legacy/ragas_context_precision": { + name: `Ragas Context Precision`, description: ` -Evaluates how pertinent the generated answer is to the given prompt. Higher scores indicate better relevancy. +This metric evaluates whether all of the ground-truth relevant items present in the contexts are ranked higher or not. Higher scores indicate better precision. `, - category: "quality", + category: "rag", docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/", + "https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html", isGuardrail: false, - requiredFields: ["input", "output"], + requiredFields: ["input", "contexts", "expected_output"], optionalFields: [], settings: { model: { description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", + }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", default: 2048, }, - embeddings_model: { - description: "The model to use for embeddings.", - default: "openai/text-embedding-ada-002", - }, }, envVars: [], result: { score: { description: - "A score between 0.0 and 1.0 indicating the relevance of the answer.", + "A score between 0.0 and 1.0 indicating the precision of the context.", }, }, }, - "ragas/rouge_score": { - name: `ROUGE Score`, + "legacy/ragas_context_recall": { + name: `Ragas Context Recall`, description: ` -Traditional NLP metric. 
ROUGE score for evaluating the similarity between two strings. +This evaluator measures the extent to which the retrieved context aligns with the annotated answer, treated as the ground truth. Higher values indicate better performance. `, - category: "quality", + category: "rag", docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/traditional/#rouge-score", + "https://docs.ragas.io/en/latest/concepts/metrics/context_recall.html", isGuardrail: false, - requiredFields: ["output", "expected_output"], + requiredFields: ["input", "contexts", "expected_output"], optionalFields: [], settings: { - rouge_type: { - description: "ROUGE type", - default: "rouge1", + model: { + description: "The model to use for evaluation.", + default: "openai/gpt-4o-mini", }, - measure_type: { - description: "ROUGE measure type", - default: "fmeasure", + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", + }, + max_tokens: { + description: + "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", + default: 2048, }, }, envVars: [], result: { score: { - description: "ROUGE similarity score", + description: + "A score between 0.0 and 1.0 indicating the recall of the context.", }, }, }, - "ragas/rubrics_based_scoring": { - name: `Rubrics Based Scoring`, + "legacy/ragas_context_relevancy": { + name: `Ragas Context Relevancy`, description: ` -Rubric-based evaluation metric that is used to evaluate responses. The rubric consists of descriptions for each score, typically ranging from 1 to 5 +This metric gauges the relevancy of the retrieved context, calculated based on both the question and contexts. The values fall within the range of (0, 1), with higher values indicating better relevancy. `, - category: "quality", + category: "rag", docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/general_purpose/#rubrics-based-criteria-scoring", + "https://docs.ragas.io/en/latest/concepts/metrics/context_relevancy.html", isGuardrail: false, - requiredFields: ["input", "output"], - optionalFields: ["expected_output"], + requiredFields: ["output", "contexts"], + optionalFields: [], settings: { model: { description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", + }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. 
Entries above this amount will be skipped.", default: 2048, }, - rubrics: { - description: undefined, - default: [ - { - description: "The response is incorrect, irrelevant.", - }, - { - description: - "The response partially answers the question but includes significant errors, omissions, or irrelevant information.", - }, - { - description: - "The response partially answers the question but includes minor errors, omissions, or irrelevant information.", - }, - { - description: - "The response fully answers the question and includes minor errors, omissions, or irrelevant information.", - }, - { - description: - "The response fully answers the question and includes no errors, omissions, or irrelevant information.", - }, - ], - }, }, envVars: [], result: { score: { description: - "A score according to the rubrics, typically between 1 and 5.", + "A score between 0.0 and 1.0 indicating the relevancy of the context.", }, }, }, - "ragas/sql_query_equivalence": { - name: `SQL Query Equivalence`, + "legacy/ragas_context_utilization": { + name: `Ragas Context Utilization`, description: ` -Checks if the SQL query is equivalent to a reference one by using an LLM to infer if it would generate the same results given the table schemas. +This metric evaluates whether all of the output relevant items present in the contexts are ranked higher or not. Higher scores indicate better utilization. `, - category: "quality", + category: "rag", docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/sql/#sql-query-semantic-equivalence", + "https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html", isGuardrail: false, - requiredFields: ["output", "expected_output", "expected_contexts"], + requiredFields: ["input", "output", "contexts"], optionalFields: [], settings: { model: { description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", + }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", @@ -2293,19 +2146,20 @@ Checks if the SQL query is equivalent to a reference one by using an LLM to infe }, envVars: [], result: { - passed: { - description: "Whether the SQL query is equivalent to the expected one.", + score: { + description: + "A score between 0.0 and 1.0 indicating the utilization of the context.", }, }, }, - "ragas/summarization_score": { - name: `Summarization Score`, + "legacy/ragas_faithfulness": { + name: `Ragas Faithfulness`, description: ` -Measures how well the summary captures important information from the retrieved contexts. +This evaluator assesses the extent to which the generated answer is consistent with the provided context. Higher scores indicate better faithfulness to the context, useful for detecting hallucinations. 
`, - category: "quality", + category: "rag", docsUrl: - "https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/summarization_score/", + "https://docs.ragas.io/en/latest/concepts/metrics/faithfulness.html", isGuardrail: false, requiredFields: ["output", "contexts"], optionalFields: [], @@ -2314,6 +2168,10 @@ Measures how well the summary captures important information from the retrieved description: "The model to use for evaluation.", default: "openai/gpt-4o-mini", }, + embeddings_model: { + description: "The model to use for embeddings.", + default: "openai/text-embedding-ada-002", + }, max_tokens: { description: "The maximum number of tokens allowed for evaluation, a too high number can be costly. Entries above this amount will be skipped.", @@ -2324,7 +2182,7 @@ Measures how well the summary captures important information from the retrieved result: { score: { description: - "A score between 0.0 and 1.0 indicating the summarization quality.", + "A score between 0.0 and 1.0 indicating the faithfulness of the answer.", }, }, }, @@ -2387,4 +2245,146 @@ social security numbers. It allows customization of the detection threshold and }, }, }, + "example/word_count": { + name: `Example Evaluator`, + description: ` +This evaluator serves as a boilerplate for creating new evaluators. +`, + category: "other", + docsUrl: "https://path/to/official/docs", + isGuardrail: false, + requiredFields: ["output"], + optionalFields: [], + settings: {}, + envVars: ["NECESSARY_ENV_VAR"], + result: { + score: { + description: "How many words are there in the output, split by space", + }, + }, + }, + "lingua/language_detection": { + name: `Lingua Language Detection`, + description: ` +This evaluator detects the language of the input and output text to check for example if the generated answer is in the same language as the prompt, +or if it's in a specific expected language. +`, + category: "quality", + docsUrl: "https://github.com/pemistahl/lingua-py", + isGuardrail: true, + requiredFields: ["output"], + optionalFields: ["input"], + settings: { + check_for: { + description: "What should be checked", + default: "input_matches_output", + }, + expected_language: { + description: "The specific language that the output is expected to be", + default: undefined, + }, + min_words: { + description: + "Minimum number of words to check, as the language detection can be unreliable for very short texts. Inputs shorter than the minimum will be skipped.", + default: 7, + }, + threshold: { + description: + "Minimum confidence threshold for the language detection. If the confidence is lower than this, the evaluation will be skipped.", + default: 0.25, + }, + }, + envVars: [], + result: { + passed: { + description: + "Passes if the detected language on the output matches the detected language on the input, or if the output matches the expected language", + }, + label: { + description: + "Language detected on the input for input_matches_output, or language detected on the output for output_matches_language", + }, + }, + }, + "azure/content_safety": { + name: `Azure Content Safety`, + description: ` +This evaluator detects potentially unsafe content in text, including hate speech, +self-harm, sexual content, and violence. It allows customization of the severity +threshold and the specific categories to check. 
+`, + category: "safety", + docsUrl: + "https://learn.microsoft.com/en-us/azure/ai-services/content-safety/quickstart-text", + isGuardrail: true, + requiredFields: [], + optionalFields: ["input", "output"], + settings: { + severity_threshold: { + description: + "The minimum severity level to consider content as unsafe, from 1 to 7.", + default: 1, + }, + categories: { + description: "The categories of moderation to check for.", + default: { + Hate: true, + SelfHarm: true, + Sexual: true, + Violence: true, + }, + }, + output_type: { + description: + "The type of severity levels to return on the full 0-7 severity scale, it can be either the trimmed version with four values (0, 2, 4, 6 scores) or the whole range.", + default: "FourSeverityLevels", + }, + }, + envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"], + result: { + score: { + description: + "The severity level of the detected content from 0 to 7. A higher score indicates higher severity.", + }, + }, + }, + "azure/jailbreak": { + name: `Azure Jailbreak Detection`, + description: ` +This evaluator checks for jailbreak-attempt in the input using Azure's Content Safety API. +`, + category: "safety", + docsUrl: "", + isGuardrail: true, + requiredFields: ["input"], + optionalFields: [], + settings: {}, + envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"], + result: { + passed: { + description: + "If true then no jailbreak was detected, if false then a jailbreak was detected", + }, + }, + }, + "azure/prompt_injection": { + name: `Azure Prompt Shield`, + description: ` +This evaluator checks for prompt injection attempt in the input and the contexts using Azure's Content Safety API. +`, + category: "safety", + docsUrl: + "https://learn.microsoft.com/en-us/azure/ai-services/content-safety/concepts/jailbreak-detection", + isGuardrail: true, + requiredFields: ["input"], + optionalFields: ["contexts"], + settings: {}, + envVars: ["AZURE_CONTENT_SAFETY_ENDPOINT", "AZURE_CONTENT_SAFETY_KEY"], + result: { + passed: { + description: + "If true then no prompt injection was detected, if false then a prompt injection was detected", + }, + }, + }, };
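
The hunks above rearrange and update entries in the generated evaluator registry, and every entry follows the same shape: name, description, category, docsUrl, isGuardrail, requiredFields/optionalFields, per-setting defaults, envVars, and result fields. As a rough TypeScript sketch of how such an entry could be consumed — the `AVAILABLE_EVALUATORS` constant and both helper functions below are illustrative assumptions, not part of the patch — the field names simply mirror the entries shown in the diff:

// Minimal sketch (not part of the patch). Assumes a Node-style environment
// for process.env; registry and helper names are hypothetical.

type EvaluatorDefinition = {
  name: string;
  description: string;
  category: string;
  docsUrl: string;
  isGuardrail: boolean;
  requiredFields: string[];
  optionalFields: string[];
  settings: Record<string, { description?: string; default?: unknown }>;
  envVars: string[];
  result: Record<string, { description?: string }>;
};

// Hypothetical registry holding entries like "ragas/bleu_score" or
// "openai/moderation" with the shape shown in the hunks above.
declare const AVAILABLE_EVALUATORS: Record<string, EvaluatorDefinition>;

function getDefaultSettings(evaluatorId: string): Record<string, unknown> {
  const definition = AVAILABLE_EVALUATORS[evaluatorId];
  if (!definition) {
    throw new Error(`Unknown evaluator: ${evaluatorId}`);
  }
  // Collect the declared default for every setting, e.g. model,
  // max_tokens, or distance_measure, as listed per entry above.
  return Object.fromEntries(
    Object.entries(definition.settings).map(
      ([key, setting]) => [key, setting.default] as [string, unknown]
    )
  );
}

function missingEnvVars(evaluatorId: string): string[] {
  const definition = AVAILABLE_EVALUATORS[evaluatorId];
  // Entries such as "openai/moderation" list required env vars like
  // OPENAI_API_KEY; anything unset here should block execution.
  return definition.envVars.filter((name) => !process.env[name]);
}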