1
- """Tool to generate all supported schema variations for Onyx Cloud's Vespa database."""
1
+ """Tool to generate all supported schema variations for Onyx Cloud's Vespa database.
2
+
3
+ Usage:
4
+
5
+ ```
6
+ PYTHONPATH=. python scripts/debugging/onyx_vespa_schemas.py
7
+ ```
8
+
9
+ Then, paste the generated files into the existing Vespa schema downloaded from the Vespa console,
10
+ and then re-zip it.
11
+ """
2
12
3
13
import argparse
14
+ import os
15
+ from pathlib import Path
4
16
5
17
import jinja2
6
18
@@ -16,8 +28,13 @@ def write_schema(
16
28
dim : int ,
17
29
embedding_precision : EmbeddingPrecision ,
18
30
template : jinja2 .Template ,
31
+ output_path : Path ,
19
32
) -> None :
20
- index_filename = index_name + ".sd"
33
+ # Create schemas directory if it doesn't exist
34
+ schemas_dir = output_path / "schemas"
35
+ schemas_dir .mkdir (parents = True , exist_ok = True )
36
+
37
+ index_filename = schemas_dir / (index_name + ".sd" )
21
38
22
39
schema = template .render (
23
40
multi_tenant = True ,
@@ -32,31 +49,106 @@ def write_schema(
32
49
logger .info (f"Wrote { index_filename } " )
33
50
34
51
52
def generate_document_entries() -> str:
    """Build the `<document>` XML lines for every supported embedding model.

    Each model in SUPPORTED_EMBEDDING_MODELS contributes two lines, in order:
    one for its regular index and one for its `__danswer_alt_index` variant.

    Returns:
        All generated lines joined with newlines (no trailing newline).
    """
    # NOTE(review): whitespace inside these literals (notably the leading
    # indentation) was collapsed in the reviewed copy -- confirm it against
    # the file before merging.
    lines = []
    for model in SUPPORTED_EMBEDDING_MODELS:
        lines.extend(
            (
                f' <document type="{model.index_name}" mode="index" />',
                f' <document type="{model.index_name}__danswer_alt_index" mode="index" />',
            )
        )

    return "\n".join(lines)
67
+
68
+
69
def write_cloud_services(cloud_services_template_path: str, output_path: Path) -> None:
    """Render the cloud services Jinja template and save it as `services.xml`.

    Args:
        cloud_services_template_path: Path of the Jinja template file to read.
        output_path: Directory that receives `services.xml`; it is created
            (including parents) when it does not exist yet.
    """
    output_path.mkdir(parents=True, exist_ok=True)

    template_source = Path(cloud_services_template_path).read_text(encoding="utf-8")
    template = jinja2.Environment().from_string(template_source)

    # The template receives the generated <document> lines under the
    # `document_elements` variable.
    rendered = template.render(document_elements=generate_document_entries())

    services_file = output_path / "services.xml"
    services_file.write_text(rendered, encoding="utf-8")

    logger.info(f"Wrote {services_file}")
89
+
90
+
35
91
def main() -> None:
    """Command-line entry point: write all schema files, then `services.xml`.

    Options:
        --template: Jinja template used for the schema files.
        --cloud-services-template: Jinja template used for `services.xml`.
        --output-path: Directory that receives the output (default: ".").

    For every model in SUPPORTED_EMBEDDING_MODELS, a schema is written for the
    regular index and for its `__danswer_alt_index` variant. If the cloud
    services template path does not exist, an error is logged and no
    `services.xml` is written.
    """
    parser = argparse.ArgumentParser(
        description="Generate multi tenant Vespa schemas and services configuration"
    )
    parser.add_argument(
        "--template",
        default="onyx/document_index/vespa/app_config/schemas/danswer_chunk.sd.jinja",
        help="The Jinja template to use for schemas",
    )
    parser.add_argument(
        "--cloud-services-template",
        default="ee/onyx/document_index/vespa/app_config/cloud-services.xml.jinja",
        help="The cloud-services.xml.jinja template path",
    )
    parser.add_argument(
        "--output-path",
        default=".",
        help="Output directory path (defaults to current directory)",
    )
    args = parser.parse_args()

    output_path = Path(args.output_path)

    # Load the schema template once; it is reused for every index.
    jinja_env = jinja2.Environment()
    with open(args.template, "r", encoding="utf-8") as template_file:
        schema_template = jinja_env.from_string(template_file.read())

    num_indexes = 0
    for model in SUPPORTED_EMBEDDING_MODELS:
        # Regular index first, then its alt-index counterpart.
        for index_name in (
            model.index_name,
            model.index_name + "__danswer_alt_index",
        ):
            write_schema(
                index_name,
                model.dim,
                model.embedding_precision,
                schema_template,
                output_path,
            )
            num_indexes += 1

    logger.info(f"Wrote {num_indexes} indexes.")

    # The services configuration is only generated when a template path was
    # given and that path exists.
    cloud_template = args.cloud_services_template
    if not cloud_template:
        return
    if not os.path.exists(cloud_template):
        logger.error(f"Cloud services template not found: {cloud_template}")
        return
    write_cloud_services(cloud_template, output_path)
151
+
60
152
61
153
if __name__ == "__main__" :
62
154
main ()
0 commit comments