Skip to content

Commit e9a0b8e

Browse files
committed
adi and indexer changes
1 parent b4b1409 commit e9a0b8e

22 files changed

+1087
-231
lines changed

ai_search_with_adi/ai_search.py renamed to ai_search_with_adi/ai_search/ai_search.py

Lines changed: 67 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
14
from abc import ABC, abstractmethod
25
from azure.search.documents.indexes.models import (
36
SearchIndex,
@@ -28,7 +31,7 @@
2831
)
2932
from azure.core.exceptions import HttpResponseError
3033
from azure.search.documents.indexes import SearchIndexerClient, SearchIndexClient
31-
from environment import (
34+
from ai_search_with_adi.ai_search.environment import (
3235
get_fq_blob_connection_string,
3336
get_blob_container_name,
3437
get_custom_skill_function_url,
@@ -70,31 +73,48 @@ def __init__(
7073

7174
@property
7275
def indexer_name(self):
76+
"""Get the indexer name for the indexer."""
7377
return f"{str(self.indexer_type.value)}-indexer{self.suffix}"
7478

7579
@property
7680
def skillset_name(self):
81+
"""Get the skillset name for the indexer."""
7782
return f"{str(self.indexer_type.value)}-skillset{self.suffix}"
7883

7984
@property
8085
def semantic_config_name(self):
86+
"""Get the semantic config name for the indexer."""
8187
return f"{str(self.indexer_type.value)}-semantic-config{self.suffix}"
8288

8389
@property
8490
def index_name(self):
91+
"""Get the index name for the indexer."""
8592
return f"{str(self.indexer_type.value)}-index{self.suffix}"
8693

8794
@property
8895
def data_source_name(self):
96+
"""Get the data source name for the indexer."""
8997
blob_container_name = get_blob_container_name(self.indexer_type)
9098
return f"{blob_container_name}-data-source{self.suffix}"
9199

92100
@property
93101
def vector_search_profile_name(self):
102+
"""Get the vector search profile name for the indexer."""
94103
return (
95104
f"{str(self.indexer_type.value)}-compass-vector-search-profile{self.suffix}"
96105
)
97106

107+
@property
108+
def vectorizer_name(self):
109+
"""Get the vectorizer name."""
110+
return f"{str(self.indexer_type.value)}-compass-vectorizer{self.suffix}"
111+
112+
@property
113+
def algorithm_name(self):
114+
"""Get the algorithm name."""
115+
116+
return f"{str(self.indexer_type.value)}-hnsw-algorithm{self.suffix}"
117+
98118
@abstractmethod
99119
def get_index_fields(self) -> list[SearchableField]:
100120
"""Get the index fields for the indexer.
@@ -122,6 +142,7 @@ def get_index_projections(self):
122142
return None
123143

124144
def get_synonym_map_names(self):
145+
"""Get the synonym map names for the indexer."""
125146
return []
126147

127148
def get_user_assigned_managed_identity(
@@ -292,67 +313,7 @@ def get_text_split_skill(self, context, source) -> SplitSkill:
292313

293314
return text_split_skill
294315

295-
def get_custom_text_split_skill(
296-
self,
297-
context,
298-
source,
299-
text_split_mode="semantic",
300-
maximum_page_length=1000,
301-
separator=" ",
302-
initial_threshold=0.7,
303-
appending_threshold=0.6,
304-
merging_threshold=0.6,
305-
) -> WebApiSkill:
306-
"""Get the custom skill for text split.
307-
308-
Args:
309-
-----
310-
context (str): The context of the skill
311-
inputs (List[InputFieldMappingEntry]): The inputs of the skill
312-
outputs (List[OutputFieldMappingEntry]): The outputs of the skill
313-
314-
Returns:
315-
--------
316-
WebApiSkill: The custom skill for text split"""
317-
318-
if self.test:
319-
batch_size = 2
320-
degree_of_parallelism = 2
321-
else:
322-
batch_size = 2
323-
degree_of_parallelism = 6
324-
325-
text_split_skill_inputs = [
326-
InputFieldMappingEntry(name="text", source=source),
327-
]
328-
329-
headers = {
330-
"text_split_mode": text_split_mode,
331-
"maximum_page_length": maximum_page_length,
332-
"separator": separator,
333-
"initial_threshold": initial_threshold,
334-
"appending_threshold": appending_threshold,
335-
"merging_threshold": merging_threshold,
336-
}
337-
338-
text_split_skill = WebApiSkill(
339-
name="Text Split Skill",
340-
description="Skill to split the text before sending to embedding",
341-
context=context,
342-
uri=get_custom_skill_function_url("split"),
343-
timeout="PT230S",
344-
batch_size=batch_size,
345-
degree_of_parallelism=degree_of_parallelism,
346-
http_method="POST",
347-
http_headers=headers,
348-
inputs=text_split_skill_inputs,
349-
outputs=[OutputFieldMappingEntry(name="chunks", target_name="pages")],
350-
auth_resource_id=get_function_app_authresourceid(),
351-
auth_identity=self.get_user_assigned_managed_identity(),
352-
)
353-
354-
return text_split_skill
355-
316+
356317
def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
357318
"""Get the custom skill for adi.
358319
@@ -400,6 +361,46 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
400361

401362
return adi_skill
402363

364+
def get_excel_skill(self) -> WebApiSkill:
365+
"""Get the custom skill that generates Markdown from XLSX files.
366+
367+
Returns:
368+
--------
369+
WebApiSkill: The custom skill for XLSX"""
370+
371+
if self.test:
372+
batch_size = 1
373+
degree_of_parallelism = 4
374+
else:
375+
batch_size = 1
376+
degree_of_parallelism = 8
377+
378+
output = [
379+
OutputFieldMappingEntry(name="extracted_content", target_name="pages")
380+
]
381+
382+
xlsx_skill = WebApiSkill(
383+
name="XLSX Skill",
384+
description="Skill to generate Markdown from XLSX",
385+
context="/document",
386+
uri=get_custom_skill_function_url("xlsx"),
387+
timeout="PT230S",
388+
batch_size=batch_size,
389+
degree_of_parallelism=degree_of_parallelism,
390+
http_method="POST",
391+
http_headers={},
392+
inputs=[
393+
InputFieldMappingEntry(
394+
name="source", source="/document/metadata_storage_path"
395+
)
396+
],
397+
outputs=output,
398+
auth_resource_id=get_function_app_authresourceid(),
399+
auth_identity=self.get_user_assigned_managed_identity(),
400+
)
401+
402+
return xlsx_skill
403+
403404
def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill:
404405
"""Get the key phrase extraction skill.
405406
@@ -570,25 +571,21 @@ def get_compass_vector_search(self) -> VectorSearch:
570571
Returns:
571572
VectorSearch: The vector search configuration
572573
"""
573-
vectorizer_name = (
574-
f"{str(self.indexer_type.value)}-compass-vectorizer{self.suffix}"
575-
)
576-
algorithim_name = f"{str(self.indexer_type.value)}-hnsw-algorithm{self.suffix}"
577574

578575
vector_search = VectorSearch(
579576
algorithms=[
580-
HnswAlgorithmConfiguration(name=algorithim_name),
577+
HnswAlgorithmConfiguration(name=self.algorithm_name),
581578
],
582579
profiles=[
583580
VectorSearchProfile(
584581
name=self.vector_search_profile_name,
585-
algorithm_configuration_name=algorithim_name,
586-
vectorizer=vectorizer_name,
582+
algorithm_configuration_name=self.algorithm_name,
583+
vectorizer=self.vectorizer_name,
587584
)
588585
],
589586
vectorizers=[
590587
CustomVectorizer(
591-
name=vectorizer_name,
588+
name=self.vectorizer_name,
592589
custom_web_api_parameters=CustomWebApiParameters(
593590
uri=get_custom_skill_function_url("compass"),
594591
auth_resource_id=get_function_app_authresourceid(),

ai_search_with_adi/deploy.py renamed to ai_search_with_adi/ai_search/deploy.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,45 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
14
import argparse
2-
from environment import get_search_endpoint, get_managed_identity_id, get_search_key,get_key_vault_url
5+
from ai_search_with_adi.ai_search.environment import (
6+
get_search_endpoint,
7+
get_managed_identity_id,
8+
get_search_key,
9+
get_key_vault_url,
10+
)
311
from azure.core.credentials import AzureKeyCredential
4-
from azure.identity import DefaultAzureCredential,ManagedIdentityCredential,EnvironmentCredential
12+
from azure.identity import DefaultAzureCredential
513
from azure.keyvault.secrets import SecretClient
614
from inquiry_document import InquiryDocumentAISearch
715

8-
916
def main(args):
1017
endpoint = get_search_endpoint()
1118

1219
try:
13-
credential = DefaultAzureCredential(managed_identity_client_id =get_managed_identity_id())
20+
credential = DefaultAzureCredential(
21+
managed_identity_client_id=get_managed_identity_id()
22+
)
1423
# initializing key vault client
1524
client = SecretClient(vault_url=get_key_vault_url(), credential=credential)
1625
print("Using managed identity credential")
1726
except Exception as e:
1827
print(e)
19-
credential = (
20-
AzureKeyCredential(get_search_key(client=client))
21-
)
28+
credential = AzureKeyCredential(get_search_key(client=client))
2229
print("Using Azure Key credential")
2330

2431
if args.indexer_type == "inquiry":
2532
# Deploy the inquiry index
2633
index_config = InquiryDocumentAISearch(
27-
endpoint=endpoint,
28-
credential=credential,
34+
endpoint=endpoint,
35+
credential=credential,
2936
suffix=args.suffix,
30-
rebuild=args.rebuild,
31-
enable_page_by_chunking=args.enable_page_chunking
37+
rebuild=args.rebuild,
38+
enable_page_by_chunking=args.enable_page_chunking,
3239
)
40+
else:
41+
raise ValueError("Invalid Indexer Type")
42+
3343
index_config.deploy()
3444

3545
if args.rebuild:
@@ -42,7 +52,7 @@ def main(args):
4252
"--indexer_type",
4353
type=str,
4454
required=True,
45-
help="Type of Indexer want to deploy. inquiry/summary/glossary",
55+
help="Type of Indexer want to deploy.",
4656
)
4757
parser.add_argument(
4858
"--rebuild",

ai_search_with_adi/environment.py renamed to ai_search_with_adi/ai_search/environment.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
14
"""Module providing environment definition"""
25
import os
36
from dotenv import find_dotenv, load_dotenv

0 commit comments

Comments
 (0)