onyx-dot-app · rkuo-danswer · Apr 18, 2025 · Apr 18, 2025 · Apr 18, 2025 · Apr 18, 2025
@@ -111,16 +111,16 @@ schema DANSWER_CHUNK_NAME {
             indexing: summary | attribute
         }
         field primary_owners type array<string> {
-            indexing : summary | attribute
+            indexing: summary | attribute
         }
         field secondary_owners type array<string> {
-            indexing : summary | attribute
+            indexing: summary | attribute
         }
         field access_control_list type weightedset<string> {
             indexing: summary | attribute
             rank: filter
             attribute: fast-search
-        } 
+        }
         field document_sets type weightedset<string> {
             indexing: summary | attribute
             rank: filter
@@ -149,7 +149,7 @@ schema DANSWER_CHUNK_NAME {
 
     rank-profile default_rank {
         inputs {
-            query(decay_factor) float
+            query(decay_factor) double
         }
 
         function inline document_boost() {
@@ -318,10 +318,4 @@ schema DANSWER_CHUNK_NAME {
             expression: bm25(content) + (5 * bm25(title))
         }
     }
-
-    rank-profile random_ {
-        first-phase {
-            expression: random.match
-        }
-    }
 }
@@ -133,6 +133,13 @@ def _replace_template_values_in_schema(
     )
 
 
+def _replace_tenant_template_value_in_schema(
+    schema_template: str,
+    tenant_field: str,
+) -> str:
+    """Substitute the tenant placeholder in a schema template.
+
+    Args:
+        schema_template: Schema text that may contain ``TENANT_ID_PAT``.
+        tenant_field: Text inserted in place of every ``TENANT_ID_PAT``
+            occurrence; an empty string removes the placeholder.
+
+    Returns:
+        A new string with all placeholder occurrences replaced.
+    """
+    rendered = schema_template.replace(TENANT_ID_PAT, tenant_field)
+    return rendered
+
+
 def add_ngrams_to_schema(schema_content: str) -> str:
     # Add the match blocks containing gram and gram-size to title and content fields
     schema_content = re.sub(
@@ -242,17 +249,15 @@ def ensure_indices_exist(
 
         with open(schema_file, "r") as schema_f:
             schema_template = schema_f.read()
-        schema_template = schema_template.replace(TENANT_ID_PAT, "")
-
+        schema = _replace_tenant_template_value_in_schema(schema_template, "")
         schema = _replace_template_values_in_schema(
-            schema_template,
+            schema,
             self.index_name,
             primary_embedding_dim,
             primary_embedding_precision,
         )
 
         schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
-        schema = schema.replace(TENANT_ID_PAT, "")
         zip_dict[f"schemas/{schema_names[0]}.sd"] = schema.encode("utf-8")
 
         if self.secondary_index_name:
@@ -352,9 +357,14 @@ def register_multitenant_indices(
             schema = _replace_template_values_in_schema(
                 schema_template, index_name, embedding_dim, embedding_precision
             )
-            schema = schema.replace(
-                TENANT_ID_PAT, TENANT_ID_REPLACEMENT if MULTI_TENANT else ""
+
+            tenant_id_replacement = ""
+            if MULTI_TENANT:
+                tenant_id_replacement = TENANT_ID_REPLACEMENT
+            schema = _replace_tenant_template_value_in_schema(
+                schema, tenant_id_replacement
             )
+
             schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
             zip_dict[f"schemas/{index_name}.sd"] = schema.encode("utf-8")
 

@@ -0,0 +1,48 @@
+"""Tool to generate all supported schema variations for Onyx Cloud's Vespa database."""
+
+import argparse
+
+from onyx.db.enums import EmbeddingPrecision
+from onyx.document_index.vespa.index import _replace_template_values_in_schema
+from onyx.document_index.vespa.index import _replace_tenant_template_value_in_schema
+from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
+from onyx.utils.logger import setup_logger
+from shared_configs.configs import SUPPORTED_EMBEDDING_MODELS
+
+# Module-level logger, created via the setup_logger helper imported above.
+logger = setup_logger()
+
+
+def write_schema(index_name: str, dim: int, template: str) -> None:
+    """Render one schema from ``template`` and write it to ``<index_name>.sd``.
+
+    The tenant placeholder is filled with ``TENANT_ID_REPLACEMENT`` first,
+    then the remaining template values are filled in for ``index_name`` and
+    ``dim`` using ``EmbeddingPrecision.FLOAT``. The output path is simply
+    ``index_name`` plus ``.sd``, so it is resolved like any other path
+    handed to ``open``.
+
+    Args:
+        index_name: Name of the index; also the stem of the output filename.
+        dim: Embedding dimension passed to the template renderer.
+        template: Raw schema template text.
+    """
+    output_path = f"{index_name}.sd"
+
+    # Tenant substitution runs before the general template substitution.
+    with_tenant = _replace_tenant_template_value_in_schema(
+        template, TENANT_ID_REPLACEMENT
+    )
+    rendered = _replace_template_values_in_schema(
+        with_tenant, index_name, dim, EmbeddingPrecision.FLOAT
+    )
+
+    with open(output_path, "w", encoding="utf-8") as out_file:
+        out_file.write(rendered)
+
+    logger.info(f"Wrote {output_path}")
+
+
+def main() -> None:
+    """Parse CLI arguments and write schema files for every supported model.
+
+    Reads the template named by the required ``--template`` argument, then
+    calls ``write_schema`` twice per entry in ``SUPPORTED_EMBEDDING_MODELS``:
+    once for the model's index name and once for that name with the
+    ``__danswer_alt_index`` suffix. Logs the total number of schemas written.
+    """
+    arg_parser = argparse.ArgumentParser(
+        description="Generate multi tenant Vespa schemas"
+    )
+    arg_parser.add_argument(
+        "--template", help="The schema template to use", required=True
+    )
+    cli_args = arg_parser.parse_args()
+
+    with open(cli_args.template, "r", encoding="utf-8") as template_file:
+        template_str = template_file.read()
+
+    written = 0
+    for model in SUPPORTED_EMBEDDING_MODELS:
+        # Each model yields two schemas: its own name and the alt-index name.
+        schema_names = (
+            model.index_name,
+            model.index_name + "__danswer_alt_index",
+        )
+        for schema_name in schema_names:
+            write_schema(schema_name, model.dim, template_str)
+            written += 1
+
+    logger.info(f"Wrote {written} indexes.")
+
+
+# Run the generator only when this file is executed as a script.
+if __name__ == "__main__":
+    main()