
Commit 37bcf63

1 parent c178cb9 commit 37bcf63

File tree

3 files changed (+62, -13 lines)


backend/onyx/chat/answer_scratchpad.py

Lines changed: 31 additions & 8 deletions
@@ -76,7 +76,9 @@ def web_search(query: str, outer_ctx: Dict[str, Any]) -> Dict[str, Any]:
 register_tool(
     ToolSpec(
         name="web_search",
-        description="Search the web for information.",
+        description="""
+        Search the web for information. This tool provides urls and short snippets,
+        but does not fetch the full content of the urls.""",
         parameters={
             "type": "object",
             "properties": {"query": {"type": "string"}},
@@ -110,7 +112,7 @@ def web_fetch(urls: List[str], outer_ctx: Dict[str, Any]) -> Dict[str, Any]:
 register_tool(
     ToolSpec(
         name="web_fetch",
-        description="Fetch the contents of a list of URLs.",
+        description="Fetch the full contents of a list of URLs.",
         parameters={
             "type": "object",
             "properties": {"urls": {"type": "array", "items": {"type": "string"}}},
@@ -134,24 +136,45 @@ def reasoning(outer_ctx: Dict[str, Any]) -> Dict[str, Any]:
     revised_messages = [
         {"role": "system", "content": PRIVATE_SCRATCHPAD_SYS},
     ] + messages[1:]
-    results = litellm.completion(
-        model=llm.config.model_name,
+    results = llm_completion(
+        model_name=llm.config.model_name,
         temperature=llm.config.temperature,
         messages=revised_messages,
+        tools=[],
+        stream=False,
     )
     return {"results": results["choices"][0]["message"]["content"]}


 register_tool(
     ToolSpec(
         name="reasoning",
-        description="Reason about the message history and the goal.",
+        description="""
+        Use this tool for reasoning. Powerful for complex questions and
+        tasks, or questions that require multiple steps to answer.""",
         parameters={"type": "object", "properties": {}, "required": []},
         func=reasoning,
     )
 )


+@traced(name="llm_completion", type="llm")
+def llm_completion(
+    model_name: str,
+    temperature: float,
+    messages: List[Dict[str, Any]],
+    tools: List[Dict[str, Any]],
+    stream: bool = False,
+) -> Dict[str, Any]:
+    return litellm.completion(
+        model=model_name,
+        temperature=temperature,
+        messages=messages,
+        tools=tools,
+        stream=stream,
+    )
+
+
 def tool_specs_for_openai() -> List[Dict[str, Any]]:
     return [
         {
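
The new llm_completion wrapper funnels every model call through one traced entry point. A minimal usage sketch mirroring the reasoning tool's non-streaming call above (the model name and messages are placeholders):

    # Non-streaming call: the full response comes back at once and the text
    # is read from choices[0], exactly as the reasoning tool does above.
    resp = llm_completion(
        model_name="gpt-4o",  # placeholder model name
        temperature=0.0,
        messages=[{"role": "user", "content": "Summarize the goal."}],
        tools=[],
        stream=False,
    )
    text = resp["choices"][0]["message"]["content"]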
@@ -197,12 +220,12 @@ def stream_chat_sync(
             yield {"type": "delta", "text": "\n[Timed out while composing reply]"}
             break
         # Start a streaming completion (sync iterator of deltas)
-        stream_iter = litellm.completion(
-            model=llm.config.model_name,
+        stream_iter = llm_completion(
+            model_name=llm.config.model_name,
             temperature=llm.config.temperature,
             messages=messages,
             tools=tools_decl,
-            stream=True,  # iterator of chunks
+            stream=True,
         )

         # Accumulate assistant text & tool call chunks
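
With stream=True the wrapper returns litellm's chunk iterator unchanged, so the accumulation loop that follows this hunk can stay as it was. A hedged sketch of consuming such an iterator (chunk access follows the usual OpenAI/litellm streaming shape; the real loop in stream_chat_sync also tracks tool-call chunks):

    # Collect streamed text deltas into one answer string.
    answer_parts: List[str] = []
    for chunk in stream_iter:
        delta = chunk.choices[0].delta
        if delta.content:
            answer_parts.append(delta.content)
    answer = "".join(answer_parts)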

backend/onyx/evals/one_off/create_braintrust_dataset.py

Lines changed: 1 addition & 2 deletions
@@ -109,8 +109,7 @@ def parse_csv_file(csv_path: str) -> List[Dict[str, Any]]:
     records.extend(
         [
             {
-                "question": question
-                + ". All info is contained in the quesiton. DO NOT ask any clarifying questions.",
+                "question": question,
                 "research_type": "DEEP",
                 "categories": categories,
                 "expected_depth": expected_depth,

backend/onyx/evals/providers/braintrust.py

Lines changed: 30 additions & 3 deletions
@@ -1,6 +1,6 @@
 from collections.abc import Callable

-from autoevals import Factuality
+from autoevals.llm import LLMClassifier
 from braintrust import Eval
 from braintrust import EvalCase
 from braintrust import init_dataset
@@ -12,6 +12,33 @@
 from onyx.evals.models import EvalProvider


+quality_classifier = LLMClassifier(
+    name="quality",
+    prompt_template="""
+You are a customer doing a trial of the product Onyx. Onyx provides a UI for users to chat with an LLM
+and search for information, similar to ChatGPT. You think ChatGPT's answer quality is great, and
+you want to rate Onyx's response relative to ChatGPT's response.\n
+[Question]: {{input}}\n
+[ChatGPT Answer]: {{expected}}\n
+[Onyx Answer]: {{output}}\n
+
+Please rate the quality of the Onyx answer relative to the ChatGPT answer on a scale of A to E:
+A: The Onyx answer is great and is as good or better than the ChatGPT answer.
+B: The Onyx answer is good and comparable to the ChatGPT answer.
+C: The Onyx answer is fair.
+D: The Onyx answer is poor and is worse than the ChatGPT answer.
+E: The Onyx answer is terrible and is much worse than the ChatGPT answer.
+""",
+    choice_scores={
+        "A": 1,
+        "B": 0.75,
+        "C": 0.5,
+        "D": 0.25,
+        "E": 0,
+    },
+)
+
+
 class BraintrustEvalProvider(EvalProvider):
     def eval(
         self,
@@ -34,7 +61,7 @@ def eval(
             name=BRAINTRUST_PROJECT,
             data=eval_data,
             task=task,
-            scores=[Factuality()],
+            scores=[quality_classifier],
             metadata={**configuration.model_dump()},
             max_concurrency=BRAINTRUST_MAX_CONCURRENCY,
             no_send_logs=no_send_logs,
@@ -51,7 +78,7 @@ def eval(
             name=BRAINTRUST_PROJECT,
             data=eval_cases,
             task=task,
-            scores=[Factuality()],
+            scores=[quality_classifier],
             metadata={**configuration.model_dump()},
             max_concurrency=BRAINTRUST_MAX_CONCURRENCY,
             no_send_logs=no_send_logs,
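
quality_classifier slots into the scores list as a drop-in replacement for Factuality: Braintrust fills {{input}}, {{expected}}, and {{output}} from each eval case and maps the judge's letter grade through choice_scores. A standalone usage sketch, assuming the usual autoevals scorer call signature (argument values are placeholders):

    # Score one Onyx answer against a ChatGPT reference answer.
    result = quality_classifier(
        input="How do I rotate an API key?",      # fills {{input}}
        expected="ChatGPT reference answer ...",  # fills {{expected}}
        output="Onyx answer ...",                 # fills {{output}}
    )
    print(result.score)  # e.g. 0.75 if the judge answers "B"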
