diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py index c96425e..44be7f0 100644 --- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py +++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py @@ -212,7 +212,7 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill: batch_size = 2 degree_of_parallelism = 2 else: - batch_size = 16 + batch_size = 4 degree_of_parallelism = 16 if chunk_by_page: @@ -273,10 +273,10 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill: def get_semantic_chunker_skill( self, - num_surrounding_sentences: int = 1, + num_surrounding_sentences: int = 2, similarity_threshold: float = 0.8, - max_chunk_tokens: int = 200, - min_chunk_tokens: int = 50, + max_chunk_tokens: int = 500, + min_chunk_tokens: int = 150, ) -> SplitSkill: """Get the skill for text split. @@ -296,7 +296,7 @@ def get_semantic_chunker_skill( batch_size = 2 degree_of_parallelism = 2 else: - batch_size = 16 + batch_size = 8 degree_of_parallelism = 16 semantic_text_chunker_skill_inputs = [ diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py index d5645b8..eb11fba 100644 --- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py +++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py @@ -301,10 +301,10 @@ def get_indexer(self) -> SearchIndexer: # Only place on schedule if it is not a test deployment if self.test: schedule = None - batch_size = 4 + batch_size = 1 else: schedule = {"interval": "PT15M"} - batch_size = 16 + batch_size = 2 if self.environment.use_private_endpoint: execution_environment = IndexerExecutionEnvironment.PRIVATE diff --git a/image_processing/src/image_processing/semantic_text_chunker.py b/image_processing/src/image_processing/semantic_text_chunker.py index e51f980..5a2c5b6 100644 --- a/image_processing/src/image_processing/semantic_text_chunker.py +++ b/image_processing/src/image_processing/semantic_text_chunker.py @@ -12,10 +12,10 @@ class SemanticTextChunker: def __init__( self, - num_surrounding_sentences: int = 1, + num_surrounding_sentences: int = 2, similarity_threshold: float = 0.8, - max_chunk_tokens: int = 200, - min_chunk_tokens: int = 50, + max_chunk_tokens: int = 500, + min_chunk_tokens: int = 150, ): self.num_surrounding_sentences = num_surrounding_sentences self.similarity_threshold = similarity_threshold