Skip to content

Commit d044ef7

Browse files
rkuo-danswer and Richard Kuo (Onyx) authored
tool to generate vespa schema variations for our cloud (onyx-dot-app#4556)
* tool to generate vespa schema variations for our cloud
* extraneous assign
* float, not double
* back to double

---------

Co-authored-by: Richard Kuo (Onyx) <rkuo@onyx.app>
1 parent 6b1f50c commit d044ef7

File tree

3 files changed

+68
-16
lines changed

3 files changed

+68
-16
lines changed

backend/onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -111,16 +111,16 @@ schema DANSWER_CHUNK_NAME {
111111
indexing: summary | attribute
112112
}
113113
field primary_owners type array<string> {
114-
indexing : summary | attribute
114+
indexing: summary | attribute
115115
}
116116
field secondary_owners type array<string> {
117-
indexing : summary | attribute
117+
indexing: summary | attribute
118118
}
119119
field access_control_list type weightedset<string> {
120120
indexing: summary | attribute
121121
rank: filter
122122
attribute: fast-search
123-
}
123+
}
124124
field document_sets type weightedset<string> {
125125
indexing: summary | attribute
126126
rank: filter
@@ -149,7 +149,7 @@ schema DANSWER_CHUNK_NAME {
149149

150150
rank-profile default_rank {
151151
inputs {
152-
query(decay_factor) float
152+
query(decay_factor) double
153153
}
154154

155155
function inline document_boost() {
@@ -318,10 +318,4 @@ schema DANSWER_CHUNK_NAME {
318318
expression: bm25(content) + (5 * bm25(title))
319319
}
320320
}
321-
322-
rank-profile random_ {
323-
first-phase {
324-
expression: random.match
325-
}
326-
}
327321
}

backend/onyx/document_index/vespa/index.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,13 @@ def _replace_template_values_in_schema(
133133
)
134134

135135

136+
def _replace_tenant_template_value_in_schema(
    schema_template: str,
    tenant_field: str,
) -> str:
    """Fill in the tenant placeholder of a schema template.

    Every occurrence of ``TENANT_ID_PAT`` in ``schema_template`` is replaced
    with ``tenant_field``. Passing an empty string removes the placeholder.

    Args:
        schema_template: Raw schema text containing the tenant placeholder.
        tenant_field: Text to substitute for each placeholder occurrence.

    Returns:
        The schema text with the tenant placeholder substituted.
    """
    rendered_schema = schema_template.replace(TENANT_ID_PAT, tenant_field)
    return rendered_schema
141+
142+
136143
def add_ngrams_to_schema(schema_content: str) -> str:
137144
# Add the match blocks containing gram and gram-size to title and content fields
138145
schema_content = re.sub(
@@ -242,17 +249,15 @@ def ensure_indices_exist(
242249

243250
with open(schema_file, "r") as schema_f:
244251
schema_template = schema_f.read()
245-
schema_template = schema_template.replace(TENANT_ID_PAT, "")
246-
252+
schema = _replace_tenant_template_value_in_schema(schema_template, "")
247253
schema = _replace_template_values_in_schema(
248-
schema_template,
254+
schema,
249255
self.index_name,
250256
primary_embedding_dim,
251257
primary_embedding_precision,
252258
)
253259

254260
schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
255-
schema = schema.replace(TENANT_ID_PAT, "")
256261
zip_dict[f"schemas/{schema_names[0]}.sd"] = schema.encode("utf-8")
257262

258263
if self.secondary_index_name:
@@ -352,9 +357,14 @@ def register_multitenant_indices(
352357
schema = _replace_template_values_in_schema(
353358
schema_template, index_name, embedding_dim, embedding_precision
354359
)
355-
schema = schema.replace(
356-
TENANT_ID_PAT, TENANT_ID_REPLACEMENT if MULTI_TENANT else ""
360+
361+
tenant_id_replacement = ""
362+
if MULTI_TENANT:
363+
tenant_id_replacement = TENANT_ID_REPLACEMENT
364+
schema = _replace_tenant_template_value_in_schema(
365+
schema, tenant_id_replacement
357366
)
367+
358368
schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
359369
zip_dict[f"schemas/{index_name}.sd"] = schema.encode("utf-8")
360370

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Tool to generate all supported schema variations for Onyx Cloud's Vespa database."""
2+
3+
import argparse
4+
5+
from onyx.db.enums import EmbeddingPrecision
6+
from onyx.document_index.vespa.index import _replace_template_values_in_schema
7+
from onyx.document_index.vespa.index import _replace_tenant_template_value_in_schema
8+
from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
9+
from onyx.utils.logger import setup_logger
10+
from shared_configs.configs import SUPPORTED_EMBEDDING_MODELS
11+
12+
logger = setup_logger()
13+
14+
15+
def write_schema(index_name: str, dim: int, template: str) -> None:
    """Render the schema template for one index and write it to ``<index_name>.sd``.

    The tenant placeholder is filled with ``TENANT_ID_REPLACEMENT`` first, then
    the remaining template values are substituted from ``index_name`` and
    ``dim`` using ``EmbeddingPrecision.FLOAT``. The output path is relative, so
    the file is created in the current working directory.

    Args:
        index_name: Name of the index; also used as the output file stem.
        dim: Embedding dimension passed to the template substitution.
        template: Contents of the schema template to render.
    """
    index_filename = index_name + ".sd"

    # Tenant substitution runs before the generic template substitution.
    with_tenant_field = _replace_tenant_template_value_in_schema(
        template, TENANT_ID_REPLACEMENT
    )
    rendered_schema = _replace_template_values_in_schema(
        with_tenant_field, index_name, dim, EmbeddingPrecision.FLOAT
    )

    with open(index_filename, "w", encoding="utf-8") as schema_file:
        schema_file.write(rendered_schema)

    logger.info(f"Wrote {index_filename}")
28+
29+
30+
def main() -> None:
    """Generate schema files for every supported embedding model.

    Reads the schema template given by the required ``--template`` argument
    and, for each entry in ``SUPPORTED_EMBEDDING_MODELS``, writes two schemas:
    one named after the model's index and one with the
    ``__danswer_alt_index`` suffix appended. Logs the total number written.
    """
    arg_parser = argparse.ArgumentParser(
        description="Generate multi tenant Vespa schemas"
    )
    arg_parser.add_argument(
        "--template", help="The schema template to use", required=True
    )
    cli_args = arg_parser.parse_args()

    with open(cli_args.template, "r", encoding="utf-8") as template_file:
        template_str = template_file.read()

    # Each model gets a primary schema and an alternate-index schema.
    name_suffixes = ("", "__danswer_alt_index")

    num_indexes = 0
    for model in SUPPORTED_EMBEDDING_MODELS:
        for suffix in name_suffixes:
            write_schema(model.index_name + suffix, model.dim, template_str)
            num_indexes += 1

    logger.info(f"Wrote {num_indexes} indexes.")
45+
46+
47+
if __name__ == "__main__":
48+
main()

0 commit comments

Comments
 (0)