1515from sentence_transformers import SentenceTransformer
1616from sidekick .logger import logger
1717from sklearn .metrics .pairwise import cosine_similarity
18- from transformers import (AutoConfig , AutoModelForCausalLM , AutoTokenizer ,
19- BitsAndBytesConfig )
18+ from transformers import AutoConfig , AutoModelForCausalLM , AutoTokenizer , BitsAndBytesConfig
2019
2120
2221def generate_sentence_embeddings (model_path : str , x , batch_size : int = 32 , device : Optional [str ] = None ):
@@ -269,9 +268,9 @@ def get_table_keys(file_path: str, table_key: str):
269268 return res , data
270269
271270
272- def is_resource_low ():
273- free_in_GB = int (torch .cuda .mem_get_info ()[0 ] / 1024 ** 3 )
274- total_memory = int (torch .cuda .get_device_properties (0 ).total_memory / 1024 ** 3 )
271+ def is_resource_low (device_index : int = 0 ):
272+ free_in_GB = int (torch .cuda .mem_get_info (device_index )[0 ] / 1024 ** 3 )
273+ total_memory = int (torch .cuda .get_device_properties (device_index ).total_memory / 1024 ** 3 )
275274 logger .info (f"Total Memory: { total_memory } GB" )
276275 logger .info (f"Free GPU memory: { free_in_GB } GB" )
277276 off_load = True
@@ -296,20 +295,21 @@ def load_causal_lm_model(
296295 }
297296 model_name = model_choices_map [model_type ]
298297 logger .info (f"Loading model: { model_name } " )
298+ device_index = 0
299299 # Load h2oGPT.SQL model
300- device = {"" : 0 } if torch .cuda .is_available () else "cpu" if device == "auto" else device
300+ device = {"" : device_index } if torch .cuda .is_available () else "cpu" if device == "auto" else device
301301 total_memory = int (torch .cuda .get_device_properties (0 ).total_memory / 1024 ** 3 )
302302 free_in_GB = int (torch .cuda .mem_get_info ()[0 ] / 1024 ** 3 )
303303 logger .info (f"Free GPU memory: { free_in_GB } GB" )
304304 n_gpus = torch .cuda .device_count ()
305+ logger .info (f"Total GPUs: { n_gpus } " )
305306 _load_in_8bit = load_in_8bit
306307
307308 # 22GB (the minimum GPU memory required) is a magic number for the current model size.
308309 if off_load and re_generate and total_memory < 22 :
309310 # To prevent the system from crashing in case memory runs low.
310311 # TODO: Performance when offloading to CPU.
311- max_memory = f"{ 4 } GB"
312- max_memory = {i : max_memory for i in range (n_gpus )}
312+ max_memory = {device_index : f"{ 4 } GB" }
313313 logger .info (f"Max Memory: { max_memory } , offloading to CPU" )
314314 with init_empty_weights ():
315315 config = AutoConfig .from_pretrained (model_name , cache_dir = cache_path , offload_folder = cache_path )
@@ -322,8 +322,7 @@ def load_causal_lm_model(
322322 _load_in_8bit = True
323323 load_in_4bit = False
324324 else :
325- max_memory = f"{ int (free_in_GB )- 2 } GB"
326- max_memory = {i : max_memory for i in range (n_gpus )}
325+ max_memory = {device_index : f"{ int (free_in_GB )- 2 } GB" }
327326 _offload_state_dict = False
328327 _llm_int8_enable_fp32_cpu_offload = False
329328
0 commit comments