|
| 1 | +# Copyright (c) Microsoft Corporation. |
| 2 | +# Licensed under the MIT License. |
| 3 | + |
1 | 4 | from abc import ABC, abstractmethod
|
2 | 5 | from azure.search.documents.indexes.models import (
|
3 | 6 | SearchIndex,
|
|
28 | 31 | )
|
29 | 32 | from azure.core.exceptions import HttpResponseError
|
30 | 33 | from azure.search.documents.indexes import SearchIndexerClient, SearchIndexClient
|
31 |
| -from environment import ( |
| 34 | +from ai_search_with_adi.ai_search.environment import ( |
32 | 35 | get_fq_blob_connection_string,
|
33 | 36 | get_blob_container_name,
|
34 | 37 | get_custom_skill_function_url,
|
@@ -70,31 +73,48 @@ def __init__(
|
70 | 73 |
|
@property
def indexer_name(self):
    """Return the indexer name built from the indexer type value and the suffix."""
    type_label = str(self.indexer_type.value)
    return f"{type_label}-indexer{self.suffix}"
|
74 | 78 |
|
@property
def skillset_name(self):
    """Return the skillset name built from the indexer type value and the suffix."""
    return "{}-skillset{}".format(str(self.indexer_type.value), self.suffix)
|
78 | 83 |
|
@property
def semantic_config_name(self):
    """Return the semantic config name built from the indexer type value and the suffix."""
    type_label = str(self.indexer_type.value)
    name_suffix = self.suffix
    return f"{type_label}-semantic-config{name_suffix}"
|
82 | 88 |
|
@property
def index_name(self):
    """Return the index name built from the indexer type value and the suffix."""
    return "{}-index{}".format(str(self.indexer_type.value), self.suffix)
|
86 | 93 |
|
@property
def data_source_name(self):
    """Return the data source name.

    The name is the blob container name looked up for this indexer type,
    followed by ``-data-source`` and the suffix."""
    container = get_blob_container_name(self.indexer_type)
    return "{}-data-source{}".format(container, self.suffix)
|
91 | 99 |
|
@property
def vector_search_profile_name(self):
    """Return the vector search profile name built from the indexer type value and the suffix."""
    type_label = str(self.indexer_type.value)
    return f"{type_label}-compass-vector-search-profile{self.suffix}"
|
97 | 106 |
|
@property
def vectorizer_name(self):
    """Return the vectorizer name built from the indexer type value and the suffix."""
    return "{}-compass-vectorizer{}".format(
        str(self.indexer_type.value), self.suffix
    )
| 111 | + |
@property
def algorithm_name(self):
    """Return the HNSW algorithm name built from the indexer type value and the suffix."""
    type_label = str(self.indexer_type.value)
    return f"{type_label}-hnsw-algorithm{self.suffix}"
| 117 | + |
98 | 118 | @abstractmethod
|
99 | 119 | def get_index_fields(self) -> list[SearchableField]:
|
100 | 120 | """Get the index fields for the indexer.
|
@@ -122,6 +142,7 @@ def get_index_projections(self):
|
122 | 142 | return None
|
123 | 143 |
|
def get_synonym_map_names(self):
    """Return the synonym map names for the indexer; this implementation returns an empty list."""
    synonym_map_names = []
    return synonym_map_names
|
126 | 147 |
|
127 | 148 | def get_user_assigned_managed_identity(
|
@@ -292,67 +313,7 @@ def get_text_split_skill(self, context, source) -> SplitSkill:
|
292 | 313 |
|
293 | 314 | return text_split_skill
|
294 | 315 |
|
295 |
| - def get_custom_text_split_skill( |
296 |
| - self, |
297 |
| - context, |
298 |
| - source, |
299 |
| - text_split_mode="semantic", |
300 |
| - maximum_page_length=1000, |
301 |
| - separator=" ", |
302 |
| - initial_threshold=0.7, |
303 |
| - appending_threshold=0.6, |
304 |
| - merging_threshold=0.6, |
305 |
| - ) -> WebApiSkill: |
306 |
| - """Get the custom skill for text split. |
307 |
| -
|
308 |
| - Args: |
309 |
| - ----- |
310 |
| - context (str): The context of the skill |
311 |
| - inputs (List[InputFieldMappingEntry]): The inputs of the skill |
312 |
| - outputs (List[OutputFieldMappingEntry]): The outputs of the skill |
313 |
| -
|
314 |
| - Returns: |
315 |
| - -------- |
316 |
| - WebApiSkill: The custom skill for text split""" |
317 |
| - |
318 |
| - if self.test: |
319 |
| - batch_size = 2 |
320 |
| - degree_of_parallelism = 2 |
321 |
| - else: |
322 |
| - batch_size = 2 |
323 |
| - degree_of_parallelism = 6 |
324 |
| - |
325 |
| - text_split_skill_inputs = [ |
326 |
| - InputFieldMappingEntry(name="text", source=source), |
327 |
| - ] |
328 |
| - |
329 |
| - headers = { |
330 |
| - "text_split_mode": text_split_mode, |
331 |
| - "maximum_page_length": maximum_page_length, |
332 |
| - "separator": separator, |
333 |
| - "initial_threshold": initial_threshold, |
334 |
| - "appending_threshold": appending_threshold, |
335 |
| - "merging_threshold": merging_threshold, |
336 |
| - } |
337 |
| - |
338 |
| - text_split_skill = WebApiSkill( |
339 |
| - name="Text Split Skill", |
340 |
| - description="Skill to split the text before sending to embedding", |
341 |
| - context=context, |
342 |
| - uri=get_custom_skill_function_url("split"), |
343 |
| - timeout="PT230S", |
344 |
| - batch_size=batch_size, |
345 |
| - degree_of_parallelism=degree_of_parallelism, |
346 |
| - http_method="POST", |
347 |
| - http_headers=headers, |
348 |
| - inputs=text_split_skill_inputs, |
349 |
| - outputs=[OutputFieldMappingEntry(name="chunks", target_name="pages")], |
350 |
| - auth_resource_id=get_function_app_authresourceid(), |
351 |
| - auth_identity=self.get_user_assigned_managed_identity(), |
352 |
| - ) |
353 |
| - |
354 |
| - return text_split_skill |
355 |
| - |
| 316 | + |
356 | 317 | def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
|
357 | 318 | """Get the custom skill for adi.
|
358 | 319 |
|
@@ -400,6 +361,46 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
|
400 | 361 |
|
401 | 362 | return adi_skill
|
402 | 363 |
|
def get_excel_skill(self) -> WebApiSkill:
    """Get the custom Web API skill for XLSX files.

    The skill is described as generating Markdown from XLSX. It runs at the
    ``/document`` context, passes ``/document/metadata_storage_path`` as the
    ``source`` input and maps the ``extracted_content`` output to ``pages``.

    Returns:
    --------
        WebApiSkill: The custom skill for XLSX"""

    # The batch size is 1 in both modes; only the parallelism depends on
    # whether the indexer runs in test mode.
    batch_size = 1
    degree_of_parallelism = 4 if self.test else 8

    skill_inputs = [
        InputFieldMappingEntry(
            name="source", source="/document/metadata_storage_path"
        )
    ]
    skill_outputs = [
        OutputFieldMappingEntry(name="extracted_content", target_name="pages")
    ]

    return WebApiSkill(
        name="XLSX Skill",
        description="Skill to generate Markdown from XLSX",
        context="/document",
        uri=get_custom_skill_function_url("xlsx"),
        timeout="PT230S",
        batch_size=batch_size,
        degree_of_parallelism=degree_of_parallelism,
        http_method="POST",
        http_headers={},
        inputs=skill_inputs,
        outputs=skill_outputs,
        auth_resource_id=get_function_app_authresourceid(),
        auth_identity=self.get_user_assigned_managed_identity(),
    )
| 403 | + |
403 | 404 | def get_key_phrase_extraction_skill(self, context, source) -> WebApiSkill:
|
404 | 405 | """Get the key phrase extraction skill.
|
405 | 406 |
|
@@ -570,25 +571,21 @@ def get_compass_vector_search(self) -> VectorSearch:
|
570 | 571 | Returns:
|
571 | 572 | VectorSearch: The vector search configuration
|
572 | 573 | """
|
573 |
| - vectorizer_name = ( |
574 |
| - f"{str(self.indexer_type.value)}-compass-vectorizer{self.suffix}" |
575 |
| - ) |
576 |
| - algorithim_name = f"{str(self.indexer_type.value)}-hnsw-algorithm{self.suffix}" |
577 | 574 |
|
578 | 575 | vector_search = VectorSearch(
|
579 | 576 | algorithms=[
|
580 |
| - HnswAlgorithmConfiguration(name=algorithim_name), |
| 577 | + HnswAlgorithmConfiguration(name=self.algorithm_name), |
581 | 578 | ],
|
582 | 579 | profiles=[
|
583 | 580 | VectorSearchProfile(
|
584 | 581 | name=self.vector_search_profile_name,
|
585 |
| - algorithm_configuration_name=algorithim_name, |
586 |
| - vectorizer=vectorizer_name, |
| 582 | + algorithm_configuration_name=self.algorithm_name, |
| 583 | + vectorizer=self.vectorizer_name, |
587 | 584 | )
|
588 | 585 | ],
|
589 | 586 | vectorizers=[
|
590 | 587 | CustomVectorizer(
|
591 |
| - name=vectorizer_name, |
| 588 | + name=self.vectorizer_name, |
592 | 589 | custom_web_api_parameters=CustomWebApiParameters(
|
593 | 590 | uri=get_custom_skill_function_url("compass"),
|
594 | 591 | auth_resource_id=get_function_app_authresourceid(),
|
|
0 commit comments