
Commit b6ac2f1

Merge branch 'main' into feature/local_test
2 parents cdcc0e2 + 555aa92 commit b6ac2f1


45 files changed: +2411 −87 lines changed

deploy_ai_search/README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -24,6 +24,7 @@ The associated scripts in this portion of the repository contains pre-built scri
 - `index_type text_2_sql`. This selects the `Text2SQLAISearch` sub class.
 - `rebuild`. Whether to delete and rebuild the index.
 - `suffix`. Optional parameter that will apply a suffix onto the deployed index and indexer. This is useful if you want deploy a test version, before overwriting the main version.
+- `single_data_dictionary`. Optional parameter that controls whether you will be uploading a single data dictionary, or a data dictionary file per entity. By default, this is set to False.

 ### Query Cache Index
```
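
A hypothetical invocation combining these parameters might look like the sketch below; the argument values are illustrative, and the exact boolean syntax depends on how `deploy.py` parses them (see its diff below).

```bash
# Illustrative example: deploy a test copy of the Text2SQL index that reads
# one combined data dictionary file. Argument values are assumptions.
python deploy.py \
  --index_type text_2_sql \
  --rebuild True \
  --suffix test \
  --single_data_dictionary True
```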

deploy_ai_search/deploy.py

Lines changed: 9 additions & 1 deletion
```diff
@@ -23,7 +23,9 @@ def deploy_config(arguments: argparse.Namespace):
         )
     elif arguments.index_type == "text_2_sql":
         index_config = Text2SqlAISearch(
-            suffix=arguments.suffix, rebuild=arguments.rebuild
+            suffix=arguments.suffix,
+            rebuild=arguments.rebuild,
+            single_data_dictionary=arguments.single_data_dictionary,
         )
     elif arguments.index_type == "text_2_sql_query_cache":
         index_config = Text2SqlQueryCacheAISearch(
@@ -58,6 +60,12 @@ def deploy_config(arguments: argparse.Namespace):
         required=False,
         help="Whether want to enable chunking by page in adi skill, if no value is passed considered False",
     )
+    parser.add_argument(
+        "--single_data_dictionary",
+        type=bool,
+        required=False,
+        help="Whether or not a single data dictionary file should be uploaded, or one per entity",
+    )
     parser.add_argument(
         "--suffix",
         type=str,
```
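
One behavioural note on the new argument above: `argparse` with `type=bool` applies Python's `bool()` to the raw string, so any non-empty value, including the literal string `False`, parses as `True`. A common defensive pattern is a small converter; the `str2bool` helper below is a sketch, not part of this commit.

```python
import argparse


def str2bool(value: str) -> bool:
    """Map common true/false spellings onto a real boolean."""
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")


parser = argparse.ArgumentParser()
# With type=bool, bool("False") evaluates to True; type=str2bool avoids that.
parser.add_argument("--single_data_dictionary", type=str2bool, required=False)
args = parser.parse_args(["--single_data_dictionary", "False"])
print(args.single_data_dictionary)  # False
```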

deploy_ai_search/rag_documents.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -101,15 +101,17 @@ def get_index_fields(self) -> list[SearchableField]:
                 name="Figures",
                 collection=True,
                 fields=[
-                    SimpleField(
+                    SearchableField(
                         name="FigureId",
                         type=SearchFieldDataType.String,
                         collection=True,
+                        searchable=False,
                     ),
-                    SimpleField(
+                    SearchableField(
                         name="FigureUri",
                         type=SearchFieldDataType.String,
                         collection=True,
+                        searchable=False,
                     ),
                 ],
             ),
```
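
For readers comparing the two helpers: both return a `SearchField`, and `SearchableField` defaults `searchable` to `True` but accepts an override, so the replacement keeps these string collections non-searchable while stating that explicitly. A minimal sketch, assuming the `azure-search-documents` 11.6.0b6 pin introduced later in this commit:

```python
from azure.search.documents.indexes.models import (
    SearchableField,
    SearchFieldDataType,
)

# SearchableField builds a SearchField; with searchable=False the resulting
# string collection is stored and retrievable but not full-text searchable.
figure_id = SearchableField(
    name="FigureId",
    type=SearchFieldDataType.String,
    collection=True,
    searchable=False,
)
print(figure_id.searchable)  # False
```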

deploy_ai_search/requirements.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 python-dotenv
-azure-search-documents==11.6.0b5
+azure-search-documents==11.6.0b6
 azure-storage-blob
 azure-identity
 azure-mgmt-web
```

deploy_ai_search/text_2_sql.py

Lines changed: 29 additions & 5 deletions
```diff
@@ -18,6 +18,7 @@
     BlobIndexerDataToExtract,
     IndexerExecutionEnvironment,
     BlobIndexerParsingMode,
+    FieldMappingFunction,
 )
 from ai_search import AISearch
 from environment import (
@@ -28,7 +29,12 @@
 class Text2SqlAISearch(AISearch):
     """This class is used to deploy the sql index."""

-    def __init__(self, suffix: str | None = None, rebuild: bool | None = False):
+    def __init__(
+        self,
+        suffix: str | None = None,
+        rebuild: bool | None = False,
+        single_data_dictionary: bool | None = False,
+    ):
         """Initialize the Text2SqlAISearch class. This class implements the deployment of the sql index.

         Args:
@@ -38,7 +44,10 @@ def __init__(self, suffix: str | None = None, rebuild: bool | None = False):
         self.indexer_type = IndexerType.TEXT_2_SQL
         super().__init__(suffix, rebuild)

-        self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY
+        if single_data_dictionary:
+            self.parsing_mode = BlobIndexerParsingMode.JSON_ARRAY
+        else:
+            self.parsing_mode = BlobIndexerParsingMode.JSON

     def get_index_fields(self) -> list[SearchableField]:
         """This function returns the index fields for sql index.
```
```diff
@@ -47,10 +56,15 @@ def get_index_fields(self) -> list[SearchableField]:
             list[SearchableField]: The index fields for sql index"""

         fields = [
+            SimpleField(
+                name="Id",
+                type=SearchFieldDataType.String,
+                key=True,
+                analyzer_name="keyword",
+            ),
             SearchableField(
                 name="Entity",
                 type=SearchFieldDataType.String,
-                key=True,
                 analyzer_name="keyword",
             ),
             SearchableField(
@@ -76,15 +90,17 @@ def get_index_fields(self) -> list[SearchableField]:
                     SearchableField(name="Name", type=SearchFieldDataType.String),
                     SearchableField(name="Definition", type=SearchFieldDataType.String),
                     SearchableField(name="Type", type=SearchFieldDataType.String),
-                    SimpleField(
+                    SearchableField(
                         name="AllowedValues",
                         type=SearchFieldDataType.String,
                         collection=True,
+                        searchable=False,
                     ),
-                    SimpleField(
+                    SearchableField(
                         name="SampleValues",
                         type=SearchFieldDataType.String,
                         collection=True,
+                        searchable=False,
                     ),
                 ],
             ),
@@ -190,6 +206,14 @@ def get_indexer(self) -> SearchIndexer:
                 )
             ],
             output_field_mappings=[
+                FieldMapping(
+                    source_field_name="/document/Entity",
+                    target_field_name="Id",
+                    mapping_function=FieldMappingFunction(
+                        name="base64Encode",
+                        parameters={"useHttpServerUtilityUrlTokenEncode": False},
+                    ),
+                ),
                 FieldMapping(
                     source_field_name="/document/Entity", target_field_name="Entity"
                 ),
```
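
The new `Id` mapping above is the standard workaround for Azure AI Search's key restrictions: document keys may only contain letters, digits, underscores, dashes and equals signs, and entity names such as `SalesLT.Product` contain dots, so `Entity` can no longer serve as the key directly. The `base64Encode` mapping function runs service-side; the Python below is a rough approximation for illustration, not code from this commit.

```python
import base64


def key_safe_id(entity: str) -> str:
    """Approximate the indexer's base64Encode field mapping function
    (useHttpServerUtilityUrlTokenEncode=False) over the UTF-8 bytes."""
    return base64.b64encode(entity.encode("utf-8")).decode("ascii")


print(key_safe_id("SalesLT.Product"))  # U2FsZXNMVC5Qcm9kdWN0
```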

deploy_ai_search/text_2_sql_query_cache.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -77,15 +77,17 @@ def get_index_fields(self) -> list[SearchableField]:
                     SearchableField(
                         name="Type", type=SearchFieldDataType.String
                     ),
-                    SimpleField(
+                    SearchableField(
                         name="AllowedValues",
                         type=SearchFieldDataType.String,
                         collection=True,
+                        searchable=False,
                     ),
-                    SimpleField(
+                    SearchableField(
                         name="SampleValues",
                         type=SearchFieldDataType.String,
                         collection=True,
+                        searchable=False,
                     ),
                 ],
             ),
```

documentation/IP-deck.pdf

2.61 MB
Binary file not shown.

text_2_sql/README.md

Lines changed: 13 additions & 75 deletions
```diff
@@ -2,8 +2,6 @@

 This portion of the repo contains code to implement a multi-shot approach to Text2SQL generation. This code can be integrated into a RAG application to allow the application to intelligently switch between different data sources (SQL, AI Search etc) to answer the question with the best possible information.

-The implementation is written for [Semantic Kernel](https://github.com/microsoft/semantic-kernel) in Python, although it can easily be adapted for C# or another framework such as LangChain.
-
 The sample provided works with Azure SQL Server, although it has been easily adapted to other SQL sources such as Snowflake.

 **Three iterations on the approach are provided for SQL query generation. A prompt based approach and a two vector database based approaches. See Multi-Shot Approach for more details**
@@ -68,6 +66,10 @@ As the query cache is shared between users (no data is stored in the cache), a n
 |**Disadvantages** | Slows down significantly as the number of entities increases. | Uses LLM to detect the best fitting entity which is slow compared to a vector approach. | AI Search adds additional cost to the solution. | Slower than other approaches for the first time a question with no similar questions in the cache is asked. |
 | | Consumes a significant number of tokens as number of entities increases. | As number of entities increases, token usage will grow but at a lesser rate than Iteration 1. | | AI Search adds additional cost to the solution. |
 | | LLM struggled to differentiate which table to choose with the large amount of information passed. | | |
+|**Code Availability**| | | | |
+| Semantic Kernel | Yes :heavy_check_mark: | Yes :heavy_check_mark: | Yes :heavy_check_mark: | Yes :heavy_check_mark: |
+| LangChain | | | | |
+| AutoGen | | | | | |

 ### Complete Execution Time Comparison for Approaches

```
````diff
@@ -140,22 +142,13 @@ The top-performing product by quantity of units sold is the **Classic Vest, S**
 |----------------|---------------|
 | Classic Vest, S| Classic Vest |

-## Provided Notebooks & Scripts
-
-- `./rag_with_prompt_based_text_2_sql.ipynb` provides example of how to utilise the Prompt Based Text2SQL plugin to query the database.
-- `./rag_with_vector_based_text_2_sql.ipynb` provides example of how to utilise the Vector Based Text2SQL plugin to query the database.
-- `./rag_with_vector_based_text_2_sql_query_cache.ipynb` provides example of how to utilise the Vector Based Text2SQL plugin, alongside the query cache, to query the database.
-- `./rag_with_ai_search_and_text_2_sql.ipynb` provides an example of how to use the Text2SQL and an AISearch plugin in parallel to automatically retrieve data from the most relevant source to answer the query.
-  - This setup is useful for a production application as the SQL Database is unlikely to be able to answer all the questions a user may ask.
-- `./time_comparison_script.py` provides a utility script for performing time based comparisons between the different approaches.
-
 ## Data Dictionary

 ### entities.json

 To power the knowledge of the LLM, a data dictionary containing all the SQL views / table metadata is used. Whilst the LLM could query the database at runtime to find out the schemas for the database, storing them in a text file reduces the overall latency of the system and allows the metadata for each table to be adjusted in a form of prompt engineering.

-The data dictionary is stored in `./data_dictionary/entities.json`. Below is a sample entry for a view / table that we which to expose to the LLM. The Microsoft SQL Server [Adventure Works Database](https://learn.microsoft.com/en-us/sql/samples/adventureworks-install-configure?view=sql-server-ver16) is used as an sample.
+Below is a sample entry for a view / table that we which to expose to the LLM. The Microsoft SQL Server [Adventure Works Database](https://learn.microsoft.com/en-us/sql/samples/adventureworks-install-configure?view=sql-server-ver16) is used as an sample.

 ```json
 {
@@ -182,45 +175,14 @@ The data dictionary is stored in `./data_dictionary/entities.json`. Below is a s
 }
 ```

-#### Property Definitions
-- **EntityName** is a human readable name for the entity.
-- **Entity** is the actual name for the entity that is used in the SQL query.
-- **Description** provides a comprehensive description of what information the entity contains.
-- **Columns** contains a list of the columns exposed for querying. Each column contains:
-  - **Definition** a short definition of what information the column contains. Here you can add extra metadata to **prompt engineer** the LLM to select the right columns or interpret the data in the column correctly.
-  - **Name** is the actual column name.
-  - **Type** is the datatype for the column.
-  - **SampleValues (optional)** is a list of sample values that are in the column. This is useful for instructing the LLM of what format the data may be in.
-  - **AllowedValues (optional)** is a list of absolute allowed values for the column. This instructs the LLM only to use these values if filtering against this column.
-
-A full data dictionary must be built for all the views / tables you which to expose to the LLM. The metadata provide directly influences the accuracy of the Text2SQL component.
+See `./data_dictionary` for more details on how the data dictionary is structured and ways to **automatically generate it**.

 ## Prompt Based SQL Plugin (Iteration 2)

 This approach works well for a small number of entities (tested on up to 20 entities with hundreds of columns). It performed well on the testing, with correct metadata, we achieved 100% accuracy on the test set.

 Whilst a simple and high performing approach, the downside of this approach is the increase in number of tokens as the number of entities increases. Additionally, we found that the LLM started to get "confused" on which columns belong to which entities as the number of entities increased.

-### prompt_based_sql_plugin.py
-
-The `./plugins/prompt_based_sql_plugin/prompt_based_sql_plugin.py` contains 3 key methods to power the Prompt Based Text2SQL engine.
-
-#### system_prompt()
-
-This method takes the loaded `entities.json` file and generates a system prompt based on it. Here, the **EntityName** and **Description** are used to build a list of available entities for the LLM to select.
-
-This is then inserted into a pre-made Text2SQL generation prompt that already contains optimised and working instructions for the LLM. This system prompt for the plugin is added to the main prompt file at runtime.
-
-The **target_engine** is passed to the prompt, along with **engine_specific_rules** to ensure that the SQL queries generated work on the target engine.
-
-#### get_entity_schema()
-
-This method is called by the Semantic Kernel framework automatically, when instructed to do so by the LLM, to fetch the full schema definitions for a given entity. This returns a JSON string of the chosen entity which allows the LLM to understand the column definitions and their associated metadata. This can be called in parallel for multiple entities.
-
-#### run_sql_query()
-
-This method is called by the Semantic Kernel framework automatically, when instructed to do so by the LLM, to run a SQL query against the given database. It returns a JSON string containing a row wise dump of the results returned. These results are then interpreted to answer the question.
-
 ## Vector Based SQL Plugin (Iterations 3 & 4)

 This approach allows the system to scale without significantly increasing the number of tokens used within the system prompt. Indexing and running an AI Search instance consumes additional cost, compared to the prompt based approach.
````
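
Although the property definitions are removed from this README in favour of `./data_dictionary`, they still describe the expected shape of an entry; a hypothetical example consistent with them (AdventureWorks-style, with invented values) would look like:

```json
{
  "EntityName": "Product",
  "Entity": "SalesLT.Product",
  "Description": "Contains one row per product sold, including its name, colour and list price.",
  "Columns": [
    {
      "Name": "Name",
      "Definition": "The human readable name of the product.",
      "Type": "nvarchar",
      "SampleValues": ["Classic Vest, S", "Mountain-100 Silver, 38"]
    },
    {
      "Name": "Color",
      "Definition": "The colour variant of the product.",
      "Type": "nvarchar",
      "AllowedValues": ["Black", "Red", "Silver", "White"]
    }
  ]
}
```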
```diff
@@ -234,39 +196,15 @@ The following environmental variables control the behaviour of the Vector Based
 - **Text2Sql__UseQueryCache** - controls whether the query cached index is checked before using the standard schema index.
 - **Text2Sql__PreRunQueryCache** - controls whether the top result from the query cache index (if enabled) is pre-fetched against the data source to include the results in the prompt.

-### vector_based_sql_plugin.py
-
-The `./plugins/vector_based_sql_plugin/vector_based_sql_plugin.py` contains 3 key methods to power the Vector Based Text2SQL engine.
+## Code Availability

-#### Indexing
-
-`./deploy_ai_search/text_2_sql.py` & `./deploy_ai_search/text_2_sql_query_cache.py` contains the scripts to deploy and index the data dictionary for use within the plugin. See instructions in `./deploy_ai_search/README.md`.
-
-#### system_prompt()
-
-This method simply returns a pre-made system prompt that contains optimised and working instructions for the LLM. This system prompt for the plugin is added to the main prompt file at runtime.
-
-The **target_engine** is passed to the prompt, along with **engine_specific_rules** to ensure that the SQL queries generated work on the target engine.
-
-**If the query cache is enabled, the prompt is adjusted to instruct the LLM to look at the cached data and results first, before calling `get_entity_schema()`.**
-
-#### get_entity_schema()
-
-This method is called by the Semantic Kernel framework automatically, when instructed to do so by the LLM, to search the AI Search instance with the given text. The LLM is able to pass the key terms from the user query, and retrieve a ranked list of the most suitable entities to answer the question.
-
-The search text passed is vectorised against the entity level **Description** columns. A hybrid Semantic Reranking search is applied against the **EntityName**, **Entity**, **Columns/Name** fields.
-
-#### fetch_queries_from_cache()
-
-The vector based with query cache uses the `fetch_queries_from_cache()` method to fetch the most relevant previous query and injects it into the prompt before the initial LLM call. The use of Auto-Function Calling here is avoided to reduce the response time as the cache index will always be used first.
-
-If the score of the top result is higher than the defined threshold, the query will be executed against the target data source and the results included in the prompt. This allows us to prompt the LLM to evaluated whether it can use these results to answer the question, **without further SQL Query generation** to speed up the process.
-
-#### run_sql_query()
-
-This method is called by the Semantic Kernel framework automatically, when instructed to do so by the LLM, to run a SQL query against the given database. It returns a JSON string containing a row wise dump of the results returned. These results are then interpreted to answer the question.
+| | Common Text2SQL Approach | Prompt Based Multi-Shot Text2SQL Approach | Vector Based Multi-Shot Text2SQL Approach | Vector Based Multi-Shot Text2SQL Approach With Query Cache |
+|-|-|-|-|-|
+| Semantic Kernel | Yes :heavy_check_mark: | Yes :heavy_check_mark: | Yes :heavy_check_mark: | Yes :heavy_check_mark: |
+| LangChain | | | | |
+| AutoGen | | | | | |

-Additionally, if any of the cache functionality is enabled, this method will update the query cache index based on the SQL query run, and the schemas used in execution.
+See the relevant directory for the code in the provided framework.

 ## Tips for good Text2SQL performance.

```
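
The context lines above document the cache-control settings; a minimal sketch of toggling them, assuming the plugin reads them from process environment variables as the `Text2Sql__` naming suggests (the string values are illustrative, and the plugin's exact parsing of them is not shown in this diff):

```python
import os

# Illustrative values: enable the query cache index and pre-run the top
# cached query so its results can be injected into the prompt.
os.environ["Text2Sql__UseQueryCache"] = "True"
os.environ["Text2Sql__PreRunQueryCache"] = "True"
```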

File renamed without changes.
