From d6c47411ff24a52db2431b2d0d7c613b3e943c41 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Sun, 9 Feb 2025 14:10:16 +0000 Subject: [PATCH 1/4] Update --- .../src/deploy_ai_search_indexes/ai_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py index c96425e..5a1a330 100644 --- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py +++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py @@ -212,7 +212,7 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill: batch_size = 2 degree_of_parallelism = 2 else: - batch_size = 16 + batch_size = 4 degree_of_parallelism = 16 if chunk_by_page: From c442539487d416e977d0483450ebce4c5f5ccf10 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Sun, 9 Feb 2025 14:11:34 +0000 Subject: [PATCH 2/4] Reduce batch size --- .../src/deploy_ai_search_indexes/ai_search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py index 5a1a330..56c2e90 100644 --- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py +++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py @@ -296,7 +296,7 @@ def get_semantic_chunker_skill( batch_size = 2 degree_of_parallelism = 2 else: - batch_size = 16 + batch_size = 8 degree_of_parallelism = 16 semantic_text_chunker_skill_inputs = [ From 67971b2173cf3418095f19456271e76188d52efb Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Sun, 9 Feb 2025 14:31:52 +0000 Subject: [PATCH 3/4] Update params --- .../src/deploy_ai_search_indexes/ai_search.py | 6 +++--- .../src/image_processing/semantic_text_chunker.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py index 56c2e90..44be7f0 100644 --- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py +++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/ai_search.py @@ -273,10 +273,10 @@ def get_mark_up_cleaner_skill(self, chunk_by_page: False) -> WebApiSkill: def get_semantic_chunker_skill( self, - num_surrounding_sentences: int = 1, + num_surrounding_sentences: int = 2, similarity_threshold: float = 0.8, - max_chunk_tokens: int = 200, - min_chunk_tokens: int = 50, + max_chunk_tokens: int = 500, + min_chunk_tokens: int = 150, ) -> SplitSkill: """Get the skill for text split. diff --git a/image_processing/src/image_processing/semantic_text_chunker.py b/image_processing/src/image_processing/semantic_text_chunker.py index e51f980..5a2c5b6 100644 --- a/image_processing/src/image_processing/semantic_text_chunker.py +++ b/image_processing/src/image_processing/semantic_text_chunker.py @@ -12,10 +12,10 @@ class SemanticTextChunker: def __init__( self, - num_surrounding_sentences: int = 1, + num_surrounding_sentences: int = 2, similarity_threshold: float = 0.8, - max_chunk_tokens: int = 200, - min_chunk_tokens: int = 50, + max_chunk_tokens: int = 500, + min_chunk_tokens: int = 150, ): self.num_surrounding_sentences = num_surrounding_sentences self.similarity_threshold = similarity_threshold From 49492b1e39d090d29735ff215e3c11801b3777c1 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Sun, 9 Feb 2025 15:18:16 +0000 Subject: [PATCH 4/4] Reduce batch --- .../src/deploy_ai_search_indexes/image_processing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py index d5645b8..eb11fba 100644 --- a/deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py +++ b/deploy_ai_search_indexes/src/deploy_ai_search_indexes/image_processing.py @@ -301,10 +301,10 @@ def get_indexer(self) -> SearchIndexer: # Only place on schedule if it is not a test deployment if self.test: schedule = None - batch_size = 4 + batch_size = 1 else: schedule = {"interval": "PT15M"} - batch_size = 16 + batch_size = 2 if self.environment.use_private_endpoint: execution_environment = IndexerExecutionEnvironment.PRIVATE