Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -111,16 +111,16 @@ schema DANSWER_CHUNK_NAME {
indexing: summary | attribute
}
field primary_owners type array<string> {
indexing : summary | attribute
indexing: summary | attribute
}
field secondary_owners type array<string> {
indexing : summary | attribute
indexing: summary | attribute
}
field access_control_list type weightedset<string> {
indexing: summary | attribute
rank: filter
attribute: fast-search
}
}
field document_sets type weightedset<string> {
indexing: summary | attribute
rank: filter
Expand Down Expand Up @@ -149,7 +149,7 @@ schema DANSWER_CHUNK_NAME {

rank-profile default_rank {
inputs {
query(decay_factor) float
query(decay_factor) double
}

function inline document_boost() {
Expand Down Expand Up @@ -318,10 +318,4 @@ schema DANSWER_CHUNK_NAME {
expression: bm25(content) + (5 * bm25(title))
}
}

rank-profile random_ {
first-phase {
expression: random.match
}
}
}
22 changes: 16 additions & 6 deletions backend/onyx/document_index/vespa/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,13 @@ def _replace_template_values_in_schema(
)


def _replace_tenant_template_value_in_schema(
    schema_template: str,
    tenant_field: str,
) -> str:
    """Substitute the tenant placeholder in a schema template.

    Every occurrence of ``TENANT_ID_PAT`` in ``schema_template`` is replaced
    with ``tenant_field``. Passing an empty string removes the placeholder.

    Args:
        schema_template: Schema text that may contain ``TENANT_ID_PAT``.
        tenant_field: Text to put in place of the placeholder.

    Returns:
        The schema text with the placeholder substituted.
    """
    rendered_schema = schema_template.replace(TENANT_ID_PAT, tenant_field)
    return rendered_schema


def add_ngrams_to_schema(schema_content: str) -> str:
# Add the match blocks containing gram and gram-size to title and content fields
schema_content = re.sub(
Expand Down Expand Up @@ -242,17 +249,15 @@ def ensure_indices_exist(

with open(schema_file, "r") as schema_f:
schema_template = schema_f.read()
schema_template = schema_template.replace(TENANT_ID_PAT, "")

schema = _replace_tenant_template_value_in_schema(schema_template, "")
schema = _replace_template_values_in_schema(
schema_template,
schema,
self.index_name,
primary_embedding_dim,
primary_embedding_precision,
)

schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
schema = schema.replace(TENANT_ID_PAT, "")
zip_dict[f"schemas/{schema_names[0]}.sd"] = schema.encode("utf-8")

if self.secondary_index_name:
Expand Down Expand Up @@ -352,9 +357,14 @@ def register_multitenant_indices(
schema = _replace_template_values_in_schema(
schema_template, index_name, embedding_dim, embedding_precision
)
schema = schema.replace(
TENANT_ID_PAT, TENANT_ID_REPLACEMENT if MULTI_TENANT else ""

tenant_id_replacement = ""
if MULTI_TENANT:
tenant_id_replacement = TENANT_ID_REPLACEMENT
schema = _replace_tenant_template_value_in_schema(
schema, tenant_id_replacement
)

schema = add_ngrams_to_schema(schema) if needs_reindexing else schema
zip_dict[f"schemas/{index_name}.sd"] = schema.encode("utf-8")

Expand Down
48 changes: 48 additions & 0 deletions backend/scripts/debugging/onyx_vespa_schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Tool to generate all supported schema variations for Onyx Cloud's Vespa database."""

import argparse

from onyx.db.enums import EmbeddingPrecision
from onyx.document_index.vespa.index import _replace_template_values_in_schema
from onyx.document_index.vespa.index import _replace_tenant_template_value_in_schema
from onyx.document_index.vespa_constants import TENANT_ID_REPLACEMENT
from onyx.utils.logger import setup_logger
from shared_configs.configs import SUPPORTED_EMBEDDING_MODELS

logger = setup_logger()


def write_schema(index_name: str, dim: int, template: str) -> None:
    """Render the schema template for one index and write it to disk.

    The tenant placeholder is filled with ``TENANT_ID_REPLACEMENT`` first,
    then the remaining template values are filled in using
    ``EmbeddingPrecision.FLOAT`` as the embedding precision. The result is
    written as UTF-8 to the path ``index_name + ".sd"``.

    Args:
        index_name: Name of the index; also the stem of the output file.
        dim: Embedding dimension passed through to the template renderer.
        template: Raw schema template text.
    """
    output_path = index_name + ".sd"

    # Tenant substitution runs before the general template substitution.
    tenant_rendered = _replace_tenant_template_value_in_schema(
        template, TENANT_ID_REPLACEMENT
    )
    fully_rendered = _replace_template_values_in_schema(
        tenant_rendered, index_name, dim, EmbeddingPrecision.FLOAT
    )

    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(fully_rendered)

    logger.info(f"Wrote {output_path}")


def main() -> None:
    """Generate schema files for every supported embedding model.

    Reads the template named by the required ``--template`` argument, then
    writes two schema files per entry in ``SUPPORTED_EMBEDDING_MODELS``: one
    under the model's index name and one under that name suffixed with
    ``__danswer_alt_index``. Logs the total number of files written.
    """
    arg_parser = argparse.ArgumentParser(description="Generate multi tenant Vespa schemas")
    arg_parser.add_argument("--template", help="The schema template to use", required=True)
    cli_args = arg_parser.parse_args()

    with open(cli_args.template, "r", encoding="utf-8") as template_file:
        template_str = template_file.read()

    written_count = 0
    for model in SUPPORTED_EMBEDDING_MODELS:
        # Each model gets a primary schema and an alternate-index schema.
        target_names = (
            model.index_name,
            model.index_name + "__danswer_alt_index",
        )
        for target_name in target_names:
            write_schema(target_name, model.dim, template_str)
            written_count += 1

    logger.info(f"Wrote {written_count} indexes.")


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
Loading