
Commit 45da268

generate mdx list and open api
1 parent 94605f8 · commit 45da268

2 files changed: +204 −2 lines changed

scripts/generate_mdx_list.py

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
#!/usr/bin/env python3
"""
Python script to generate MDX list from OpenAPI JSON
"""

import json
import sys
from pathlib import Path
from typing import Dict, Any, List


def categorize_evaluators(
    evaluators: Dict[str, Any],
) -> Dict[str, Dict[str, Any]]:
    """Categorize evaluators based on their paths and descriptions."""

    categories = {
        "Expected Answer Evaluation": {
            "description": "For when you have the golden answer and want to measure how correct the LLM gets it",
            "evaluators": [],
        },
        "LLM-as-Judge": {
            "description": "For when you don't have a golden answer, but have a set of rules for another LLM to evaluate quality",
            "evaluators": [],
        },
        "RAG Quality": {
            "description": "For measuring the quality of your RAG, check for hallucinations with faithfulness and precision/recall",
            "evaluators": [],
        },
        "Quality Aspects Evaluation": {
            "description": "For when you want to check the language, structure, style and other general quality metrics",
            "evaluators": [],
        },
        "Safety": {
            "description": "Check for PII, prompt injection attempts and toxic content",
            "evaluators": [],
        },
        "Other": {"description": "Miscellaneous evaluators", "evaluators": []},
    }

    for path, path_info in evaluators.items():
        if not path.endswith("/evaluate"):
            continue

        evaluator_id = path.replace("/evaluate", "")
        post_info = path_info.get("post", {})
        summary = post_info.get("summary", evaluator_id)
        description = post_info.get("description", "")

        # Convert evaluator name to proper endpoint format
        # Use the evaluator name and convert to kebab-case
        endpoint_id = summary.lower()
        # Replace spaces and special characters with hyphens
        endpoint_id = endpoint_id.replace(" ", "-")
        endpoint_id = endpoint_id.replace("_", "-")
        endpoint_id = endpoint_id.replace("/", "-")
        # Remove any non-alphanumeric characters except hyphens
        import re

        endpoint_id = re.sub(r"[^a-z0-9\-]", "", endpoint_id)
        # Remove multiple consecutive hyphens
        endpoint_id = re.sub(r"-+", "-", endpoint_id)
        # Remove leading/trailing hyphens
        endpoint_id = endpoint_id.strip("-")

        evaluator_info = {
            "id": evaluator_id,
            "name": summary,
            "description": description,
            "endpoint": f"/api-reference/{endpoint_id}",
        }

        # Categorize based on path and description
        if any(
            keyword in evaluator_id.lower()
            for keyword in [
                "exact_match",
                "llm_answer_match",
                "factual",
                "sql_query",
                "rouge",
                "bleu",
            ]
        ):
            categories["Expected Answer Evaluation"]["evaluators"].append(
                evaluator_info
            )
        elif any(
            keyword in evaluator_id.lower()
            for keyword in ["llm_boolean", "llm_score", "llm_category", "rubrics"]
        ):
            categories["LLM-as-Judge"]["evaluators"].append(evaluator_info)
        elif any(
            keyword in evaluator_id.lower()
            for keyword in [
                "faithfulness",
                "context_precision",
                "context_recall",
                "context_f1",
                "response_relevancy",
                "response_context",
            ]
        ):
            categories["RAG Quality"]["evaluators"].append(evaluator_info)
        elif any(
            keyword in evaluator_id.lower()
            for keyword in ["language_detection", "valid_format", "summarization"]
        ):
            categories["Quality Aspects Evaluation"]["evaluators"].append(
                evaluator_info
            )
        elif any(
            keyword in evaluator_id.lower()
            for keyword in [
                "pii",
                "jailbreak",
                "prompt_injection",
                "content_safety",
                "moderation",
                "llama_guard",
            ]
        ):
            categories["Safety"]["evaluators"].append(evaluator_info)
        else:
            categories["Other"]["evaluators"].append(evaluator_info)

    return categories


def generate_mdx(categories: Dict[str, Any]) -> str:
    """Generate MDX content."""

    mdx_content = []

    for category_name, category_info in categories.items():
        if not category_info["evaluators"]:
            continue

        mdx_content.append(f"## {category_name}")
        mdx_content.append(f"{category_info['description']}")
        mdx_content.append("")
        mdx_content.append("| Evaluator | Description |")
        mdx_content.append("| --------- | ----------- |")

        for evaluator in category_info["evaluators"]:
            # Clean description to remove newlines but keep full text
            desc = evaluator["description"]
            # Remove newlines and normalize whitespace
            desc = " ".join(desc.split())

            mdx_content.append(
                f"| [{evaluator['name']}]({evaluator['endpoint']}) | {desc} |"
            )

        mdx_content.append("")

    return "\n".join(mdx_content)


def main():
    """Main function to generate MDX list."""
    try:
        # Find the openapi.json file
        script_dir = Path(__file__).parent
        openapi_file = script_dir / "openapi.json"

        if not openapi_file.exists():
            print(f"Error: Could not find OpenAPI file at {openapi_file}")
            sys.exit(1)

        print(f"Reading OpenAPI from: {openapi_file}")

        with open(openapi_file, "r", encoding="utf-8") as f:
            openapi_data = json.load(f)

        paths = openapi_data.get("paths", {})
        categories = categorize_evaluators(paths)
        mdx_content = generate_mdx(categories)

        # Write the MDX content to a file
        output_path = script_dir / "evaluators-list.mdx"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(mdx_content)

        print(f"MDX list generated successfully at: {output_path}")

        # Print summary
        total_evaluators = sum(len(cat["evaluators"]) for cat in categories.values())
        active_categories = len(
            [cat for cat in categories.values() if cat["evaluators"]]
        )
        print(
            f"Generated {total_evaluators} evaluators across {active_categories} categories"
        )

    except Exception as error:
        print(f"Error generating MDX list: {error}")
        sys.exit(1)


if __name__ == "__main__":
    main()
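
For reference, a minimal sketch of how the two functions above fit together, using a single hypothetical evaluator path (the toy OpenAPI fragment is sample data for illustration, not part of the commit):

# Toy walk-through of the categorize -> generate pipeline above.
# "/exact_match/evaluate" and its summary/description are hypothetical sample data.
toy_paths = {
    "/exact_match/evaluate": {
        "post": {
            "summary": "Exact Match",
            "description": "Checks whether the output matches the expected answer exactly.",
        }
    }
}

print(generate_mdx(categorize_evaluators(toy_paths)))
# Roughly expected output:
# ## Expected Answer Evaluation
# For when you have the golden answer and want to measure how correct the LLM gets it
#
# | Evaluator | Description |
# | --------- | ----------- |
# | [Exact Match](/api-reference/exact-match) | Checks whether the output matches the expected answer exactly. |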

scripts/generate_openapi.py

Lines changed: 2 additions & 2 deletions
@@ -672,7 +672,7 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]:
             {
                 "lang": "python",
                 "label": "Offline Evaluation",
-                "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n\n # your execution code here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," if field != "expected_output" else f" \"{field}\": row[\"{field}\"]," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n',
+                "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n # your execution code here \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": row[\"{field}\"]," if field in ["input", "contexts", "expected_output"] else f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n',
             },
             {
                 "lang": "python",
@@ -745,7 +745,7 @@ def generate_openapi_schema(evaluators: Dict[str, Any]) -> Dict[str, Any]:
             {
                 "lang": "python",
                 "label": "Offline Evaluation",
-                "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n\n # your execution code here\n \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": {field}," if field != "expected_output" else f" \"{field}\": row[\"{field}\"]," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n',
+                "source": f'import langwatch\n\ndf = langwatch.dataset.get_dataset("dataset-id").to_pandas()\n\nevaluation = langwatch.evaluation.init("my-incredible-evaluation")\n\nfor index, row in evaluation.loop(df.iterrows()):\n # your execution code here \n evaluation.run(\n "{evaluator_id}",\n index=index,\n data={{\n{chr(10).join(f" \"{field}\": row[\"{field}\"]," if field in ["input", "contexts", "expected_output"] else f" \"{field}\": {field}," for field in evaluator.get("requiredFields", []) + evaluator.get("optionalFields", []))}\n }},\n settings={{}}\n )\n',
             },
             {
                 "lang": "python",
