
Commit adf33de

Orbital-Web and greptile-apps[bot] authored and committed
feat: search quality eval (onyx-dot-app#4720)
* fix: import order
* test examples
* fix: import
* wip: reranker based eval
* fix: import order
* feat: adjuted score
* fix: mypy
* fix: suggestions
* sorry cvs, you must go
* fix: mypy
* fix: suggestions

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent abc51ae commit adf33de

File tree: 6 files changed, +558 −1 lines changed


backend/onyx/context/search/utils.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -12,6 +12,9 @@
 from onyx.context.search.models import SavedSearchDocWithContent
 from onyx.context.search.models import SearchDoc
 from onyx.db.models import SearchDoc as DBSearchDoc
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
 
 
 T = TypeVar(
@@ -154,5 +157,6 @@ def remove_stop_words_and_punctuation(keywords: list[str]) -> list[str]:
             if (word.casefold() not in stop_words and word not in string.punctuation)
         ]
         return text_trimmed or word_tokens
-    except Exception:
+    except Exception as e:
+        logger.warning(f"Error removing stop words and punctuation: {e}")
         return keywords
```
Lines changed: 56 additions & 0 deletions (new file)
# Search Quality Test Script

This Python script evaluates the search results for a list of queries.

Unlike the script in answer_quality, this script is much less customizable and runs against currently ingested documents. However, it allows for quick testing of search parameters on a set of test queries that don't have well-defined answers.
## Usage

1. Ensure you have the required dependencies installed and Onyx running.

2. Ensure a reranker model is configured in the search settings.
This can be checked/modified by opening the admin panel, going to search settings, and ensuring a reranking model is set.

3. Set up the PYTHONPATH permanently:
Add the following line to your shell configuration file (e.g., `~/.bashrc`, `~/.zshrc`, or `~/.bash_profile`):
```
export PYTHONPATH=$PYTHONPATH:/path/to/onyx/backend
```
Replace `/path/to/onyx` with the actual path to your Onyx repository.
After adding this line, restart your terminal or run `source ~/.bashrc` (or the appropriate config file) to apply the changes.
4. Navigate to the `search_quality` folder in the Onyx repo:

```
cd path/to/onyx/backend/tests/regression/search_quality
```

5. Copy `search_queries.json.template` to `search_queries.json` and add/remove test queries in it.
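Since `generate_search_queries.py` loads this file with a plain `json.load` and treats it as a `list[str]`, `search_queries.json` is just a JSON array of query strings. For example (illustrative placeholder queries, not part of the template):

```
[
    "How do I configure a reranking model?",
    "Which connectors does Onyx support?",
    "Where are search settings stored?"
]
```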
6. Run `generate_search_queries.py` to generate the modified queries for the search pipeline:

```
python generate_search_queries.py
```

7. Copy `search_eval_config.yaml.template` to `search_eval_config.yaml` and specify the search and eval parameters.
8. Run `run_search_eval.py` to evaluate the search results against the reranked results:

```
python run_search_eval.py
```

9. Repeat steps 7 and 8 to test and compare different search parameters.
## Metrics
- Jaccard Similarity: the ratio of the intersection to the union of the topk search and rerank results. Higher is better.
- Average Rank Change: the average absolute rank difference between the topk reranked chunks and their positions in the full search results. Lower is better.
- Average Missing Chunk Ratio: the number of topk reranked chunks that are missing from the topk search chunks, divided by topk. Lower is better.

Note that all of these metrics are affected by very narrow search results.
E.g., if topk is 20 but there is only 1 relevant document, the other 19 documents could be ordered arbitrarily, resulting in a lower score.
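As a rough sketch of the definitions above (this is not the actual code in `run_search_eval.py`), the metrics can be computed from the ranked chunk IDs of the search and rerank results like so:

```python
def eval_topk(
    search_ids: list[str], rerank_ids: list[str], topk: int
) -> dict[str, float]:
    """Illustrative versions of the metrics described above, not the real implementation."""
    search_topk = set(search_ids[:topk])
    rerank_topk = set(rerank_ids[:topk])

    # Jaccard similarity: |intersection| / |union| of the topk search and rerank results
    union = search_topk | rerank_topk
    jaccard = len(search_topk & rerank_topk) / len(union) if union else 1.0

    # average rank change: mean absolute difference between a chunk's rerank position
    # (within topk) and its position in the full search ranking
    search_rank = {chunk_id: rank for rank, chunk_id in enumerate(search_ids)}
    rank_changes = [
        abs(rank - search_rank[chunk_id])
        for rank, chunk_id in enumerate(rerank_ids[:topk])
        if chunk_id in search_rank
    ]
    avg_rank_change = sum(rank_changes) / len(rank_changes) if rank_changes else 0.0

    # missing chunk ratio: fraction of topk reranked chunks absent from the topk search chunks
    missing_ratio = len(rerank_topk - search_topk) / topk

    return {
        "jaccard_similarity": jaccard,
        "average_rank_change": avg_rank_change,
        "average_missing_chunk_ratio": missing_ratio,
    }
```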
To address this limitation, there are score-adjusted versions of the metrics.
The score-adjusted versions do not use a fixed topk; instead, they compute an optimum topk based on the rerank scores.
This generally works well for determining how many documents are relevant, although the approach isn't perfect.
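The commit does not spell out here how the optimum topk is derived from the rerank scores. One plausible heuristic, shown purely for illustration (and assuming the scores are sorted in descending order), is to cut the ranking at the largest drop in score:

```python
def adjusted_topk(rerank_scores: list[float], min_k: int = 1) -> int:
    """Illustrative heuristic only: place the topk cutoff at the steepest score drop."""
    if len(rerank_scores) <= min_k:
        return len(rerank_scores)
    drops = [
        rerank_scores[i] - rerank_scores[i + 1] for i in range(len(rerank_scores) - 1)
    ]
    # cut just after the steepest drop, but keep at least min_k results
    return max(min_k, drops.index(max(drops)) + 1)
```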
Lines changed: 124 additions & 0 deletions (new file)

```python
import json
from pathlib import Path

from langgraph.types import StreamWriter

from onyx.agents.agent_search.basic.utils import process_llm_stream
from onyx.chat.models import PromptConfig
from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
from onyx.chat.prompt_builder.answer_prompt_builder import default_build_system_message
from onyx.chat.prompt_builder.answer_prompt_builder import default_build_user_message
from onyx.configs.app_configs import POSTGRES_API_SERVER_POOL_OVERFLOW
from onyx.configs.app_configs import POSTGRES_API_SERVER_POOL_SIZE
from onyx.configs.constants import DEFAULT_PERSONA_ID
from onyx.db.engine import get_session_with_current_tenant
from onyx.db.engine import SqlEngine
from onyx.db.persona import get_persona_by_id
from onyx.llm.factory import get_llms_for_persona
from onyx.llm.interfaces import LLM
from onyx.tools.tool_implementations.search.search_tool import SearchTool
from onyx.tools.utils import explicit_tool_calling_supported
from onyx.utils.logger import setup_logger

logger = setup_logger()


def _load_queries() -> list[str]:
    current_dir = Path(__file__).parent
    with open(current_dir / "search_queries.json", "r") as file:
        return json.load(file)


def _modify_one_query(
    query: str,
    llm: LLM,
    prompt_config: PromptConfig,
    tool_definition: dict,
    writer: StreamWriter = lambda _: None,
) -> str:
    prompt_builder = AnswerPromptBuilder(
        user_message=default_build_user_message(
            user_query=query,
            prompt_config=prompt_config,
            files=[],
            single_message_history=None,
        ),
        system_message=default_build_system_message(prompt_config, llm.config),
        message_history=[],
        llm_config=llm.config,
        raw_user_query=query,
        raw_user_uploaded_files=[],
        single_message_history=None,
    )
    prompt = prompt_builder.build()

    stream = llm.stream(
        prompt=prompt,
        tools=[tool_definition],
        tool_choice="required",
        structured_response_format=None,
    )
    tool_message = process_llm_stream(
        messages=stream,
        should_stream_answer=False,
        writer=writer,
    )
    return (
        tool_message.tool_calls[0]["args"]["query"]
        if tool_message.tool_calls
        else query
    )


class SearchToolOverride(SearchTool):
    def __init__(self) -> None:
        # do nothing, the tool_definition function doesn't require variables to be initialized
        pass


def generate_search_queries() -> None:
    SqlEngine.init_engine(
        pool_size=POSTGRES_API_SERVER_POOL_SIZE,
        max_overflow=POSTGRES_API_SERVER_POOL_OVERFLOW,
    )

    queries = _load_queries()

    with get_session_with_current_tenant() as db_session:
        persona = get_persona_by_id(DEFAULT_PERSONA_ID, None, db_session)
        llm, _ = get_llms_for_persona(persona)
        prompt_config = PromptConfig.from_model(persona.prompts[0])
        tool_definition = SearchToolOverride().tool_definition()

    tool_call_supported = explicit_tool_calling_supported(
        llm.config.model_provider, llm.config.model_name
    )

    if tool_call_supported:
        logger.info(
            "Tool calling is supported for the current model. Modifying queries."
        )
        modified_queries = [
            _modify_one_query(
                query=query,
                llm=llm,
                prompt_config=prompt_config,
                tool_definition=tool_definition,
            )
            for query in queries
        ]
    else:
        logger.warning(
            "Tool calling is not supported for the current model. "
            "Using the original queries."
        )
        modified_queries = queries

    with open("search_queries_modified.json", "w") as file:
        json.dump(modified_queries, file, indent=4)

    logger.info("Exported modified queries to search_queries_modified.json")


if __name__ == "__main__":
    generate_search_queries()
```
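Since the script dumps `modified_queries` (a `list[str]`) with `json.dump(..., indent=4)`, the resulting `search_queries_modified.json` is again a plain JSON array of query strings. The entries below are made-up examples of rewritten queries, not actual output:

```
[
    "configure reranking model search settings",
    "supported connector types",
    "search settings storage location"
]
```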
