from sentence_transformers import SentenceTransformer
from sidekick.logger import logger
from sklearn.metrics.pairwise import cosine_similarity
-from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
-                          BitsAndBytesConfig)
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def generate_sentence_embeddings(model_path: str, x, batch_size: int = 32, device: Optional[str] = None):
@@ -269,9 +268,9 @@ def get_table_keys(file_path: str, table_key: str):
    return res, data


-def is_resource_low():
-    free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
-    total_memory = int(torch.cuda.get_device_properties(0).total_memory / 1024**3)
+def is_resource_low(device_index: int = 0):
+    free_in_GB = int(torch.cuda.mem_get_info(device_index)[0] / 1024**3)
+    total_memory = int(torch.cuda.get_device_properties(device_index).total_memory / 1024**3)
    logger.info(f"Total Memory: {total_memory} GB")
    logger.info(f"Free GPU memory: {free_in_GB} GB")
    off_load = True
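
A minimal, hypothetical call site for the updated helper (not part of this commit). It assumes the function lives in sidekick.utils and, as the off_load flag above suggests, returns a truthy value when GPU memory on the selected device is running low:

# Hypothetical usage sketch; import path and return semantics are assumptions.
import torch
from sidekick.utils import is_resource_low

if torch.cuda.is_available() and is_resource_low(device_index=0):
    # Fall back to offloading / quantized loading when GPU 0 is memory-constrained.
    print("GPU 0 is low on memory; enabling offloading")
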
@@ -296,20 +295,21 @@ def load_causal_lm_model(
    }
    model_name = model_choices_map[model_type]
    logger.info(f"Loading model: {model_name}")
+    device_index = 0
    # Load h2oGPT.SQL model
-    device = {"": 0} if torch.cuda.is_available() else "cpu" if device == "auto" else device
+    device = {"": device_index} if torch.cuda.is_available() else "cpu" if device == "auto" else device
    total_memory = int(torch.cuda.get_device_properties(0).total_memory / 1024**3)
    free_in_GB = int(torch.cuda.mem_get_info()[0] / 1024**3)
    logger.info(f"Free GPU memory: {free_in_GB} GB")
    n_gpus = torch.cuda.device_count()
+    logger.info(f"Total GPUs: {n_gpus}")
    _load_in_8bit = load_in_8bit

    # 22GB (Least requirement on GPU) is a magic number for the current model size.
    if off_load and re_generate and total_memory < 22:
        # To prevent the system from crashing in-case memory runs low.
        # TODO: Performance when offloading to CPU.
-        max_memory = f"{4}GB"
-        max_memory = {i: max_memory for i in range(n_gpus)}
+        max_memory = {device_index: f"{4}GB"}
        logger.info(f"Max Memory: {max_memory}, offloading to CPU")
        with init_empty_weights():
            config = AutoConfig.from_pretrained(model_name, cache_dir=cache_path, offload_folder=cache_path)
@@ -322,8 +322,7 @@ def load_causal_lm_model(
        _load_in_8bit = True
        load_in_4bit = False
    else:
-        max_memory = f"{int(free_in_GB)-2}GB"
-        max_memory = {i: max_memory for i in range(n_gpus)}
+        max_memory = {device_index: f"{int(free_in_GB)-2}GB"}
        _offload_state_dict = False
        _llm_int8_enable_fp32_cpu_offload = False
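
As context for the max_memory change above: a single-device map like the one built here is a form transformers/accelerate accept for constraining weight placement to one GPU. A rough sketch, assuming a CUDA device and a placeholder model id (neither taken from this commit):

# Illustrative sketch only; the model id and 2 GB headroom are placeholders.
import torch
from transformers import AutoModelForCausalLM

device_index = 0
free_in_GB = int(torch.cuda.mem_get_info(device_index)[0] / 1024**3)
max_memory = {device_index: f"{int(free_in_GB)-2}GB"}  # cap usage on GPU 0 only

model = AutoModelForCausalLM.from_pretrained(
    "org/placeholder-causal-lm",  # placeholder, not the model from this commit
    device_map="auto",            # let accelerate place weights within the cap
    max_memory=max_memory,
)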