     get_dataset_info,
     get_model,
     get_optimizer_and_scheduler,
+    get_peft_branch,
     get_tokenizer,
     get_train_config,
     init_cuda,
@@ -163,6 +164,7 @@ def train(
     status = TrainStatus.FAILED
     tic_train = time.perf_counter()
     eval_time = 0.0
+    error_msg = ""
 
     ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
         tokenizer=tokenizer, query_template=query_template, print_fn=print_verbose
@@ -318,10 +320,16 @@ def train(
     except KeyboardInterrupt:
         print_verbose("canceled training")
         status = TrainStatus.CANCELED
-    except torch.OutOfMemoryError:
+        error_msg = "manually canceled"
+    except torch.OutOfMemoryError as exc:
         # ouch, still let's try to log some results
         print_verbose("out of memory error encountered")
         status = TrainStatus.CANCELED
+        error_msg = str(exc)
+    except Exception as exc:
+        print_verbose(f"encountered an error: {exc}")
+        status = TrainStatus.CANCELED
+        error_msg = str(exc)
 
     toc_train = time.perf_counter()
     train_time = toc_train - tic_train - eval_time
@@ -334,6 +342,7 @@ def train(
334
342
cuda_memory_reserved_log = cuda_memory_reserved_log ,
335
343
losses = losses ,
336
344
metrics = metrics ,
345
+ error_msg = error_msg ,
337
346
)
338
347
return train_result
339
348
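Note: with these changes, train() no longer lets training errors escape. KeyboardInterrupt, CUDA OOM, and any other exception are converted into a CANCELED status plus an error_msg string that travels with the returned result, so partial metrics can still be logged. A minimal, self-contained sketch of that pattern follows; the TrainResult/TrainStatus fields shown here are illustrative stand-ins, not the repo's actual definitions:

```python
import dataclasses
import enum
import time


class TrainStatus(enum.Enum):
    SUCCESS = "success"
    CANCELED = "canceled"
    FAILED = "failed"


@dataclasses.dataclass
class TrainResult:
    status: TrainStatus
    train_time: float
    error_msg: str = ""


def run_training(step_fn, max_steps: int) -> TrainResult:
    # capture errors instead of re-raising, so the caller always gets a result
    status = TrainStatus.FAILED
    error_msg = ""
    tic = time.perf_counter()
    try:
        for _ in range(max_steps):
            step_fn()
        status = TrainStatus.SUCCESS
    except KeyboardInterrupt:
        status = TrainStatus.CANCELED
        error_msg = "manually canceled"
    except Exception as exc:
        status = TrainStatus.CANCELED
        error_msg = str(exc)
    return TrainResult(status=status, train_time=time.perf_counter() - tic, error_msg=error_msg)
```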
@@ -342,6 +351,14 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
     tic_total = time.perf_counter()
     start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()
 
+    peft_branch = get_peft_branch()
+    if peft_branch == "main":
+        print_verbose("===== This experiment is categorized as a MAIN run because the PEFT branch is 'main' ======")
+    else:
+        print_verbose(
+            f"===== This experiment is categorized as a TEST run because the PEFT branch is '{peft_branch}' ======"
+        )
+
     # load configs
     peft_config = PeftConfig.from_pretrained(path_experiment)
     path_train_config = os.path.join(path_experiment, FILE_NAME_TRAIN_PARAMS)
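Note: get_peft_branch() comes from the benchmark's utility module and its body is not part of this diff. Judging from the log messages, it presumably reports the git branch of the local PEFT checkout so that runs from main can be separated from test runs on feature branches. A purely hypothetical sketch of such a helper, not the actual implementation:

```python
# Hypothetical stand-in for get_peft_branch(): report the git branch of the
# installed PEFT checkout, falling back to a placeholder when PEFT was not
# installed from a git repository.
import os
import subprocess

import peft


def get_peft_branch_sketch() -> str:
    repo_dir = os.path.dirname(os.path.dirname(peft.__file__))
    try:
        out = subprocess.run(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
            cwd=repo_dir,
            capture_output=True,
            text=True,
            check=True,
        )
        return out.stdout.strip()
    except (OSError, subprocess.CalledProcessError):
        return "unknown"
```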
@@ -366,39 +383,38 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
     print_verbose(model)
 
     # train model
-    try:
-        train_result = train(
-            model=model,
-            max_steps=train_config.max_steps,
-            batch_size=train_config.batch_size,
-            batch_size_eval=train_config.batch_size_eval,
-            tokenizer=tokenizer,
-            cuda_memory_init=cuda_memory_init,
-            eval_steps=train_config.eval_steps,
-            generation_kwargs=train_config.generation_kwargs,
-            grad_norm_clip=train_config.grad_norm_clip,
-            optimizer_type=train_config.optimizer_type,
-            optimizer_kwargs=train_config.optimizer_kwargs,
-            query_template=train_config.query_template,
-            lr_scheduler_arg=train_config.lr_scheduler,
-            use_amp=train_config.use_amp,
-            is_adalora=isinstance(peft_config, AdaLoraConfig),
-        )
-    except Exception as e:
-        print_verbose(f"Training failed with error: {e}")
-        raise
+    train_result = train(
+        model=model,
+        max_steps=train_config.max_steps,
+        batch_size=train_config.batch_size,
+        batch_size_eval=train_config.batch_size_eval,
+        tokenizer=tokenizer,
+        cuda_memory_init=cuda_memory_init,
+        eval_steps=train_config.eval_steps,
+        generation_kwargs=train_config.generation_kwargs,
+        grad_norm_clip=train_config.grad_norm_clip,
+        optimizer_type=train_config.optimizer_type,
+        optimizer_kwargs=train_config.optimizer_kwargs,
+        query_template=train_config.query_template,
+        lr_scheduler_arg=train_config.lr_scheduler,
+        use_amp=train_config.use_amp,
+        is_adalora=isinstance(peft_config, AdaLoraConfig),
+    )
 
     if train_result.status == TrainStatus.FAILED:
         print_verbose("Training failed, not logging results")
         sys.exit(1)
 
     # save the model in temp dir, get file size, clean it up afterwards if clean is passed
-    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
-        model.save_pretrained(tmp_dir)
-        stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
-        file_size = stat.st_size
-        if not clean:
-            print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
+    try:
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
+            file_size = stat.st_size
+            if not clean:
+                print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
+    except Exception as exc:
+        print(f"Failed to save PEFT checkpoint due to the following error: {exc}")
 
     time_total = time.perf_counter() - tic_total
     # log results: print and save to file
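One portability note on the final hunk: the delete= keyword of tempfile.TemporaryDirectory was added in Python 3.12 (and ignore_cleanup_errors in 3.10), so this code path assumes a recent interpreter. On older Pythons, roughly the same behavior could be approximated with mkdtemp, as in this sketch:

```python
# Rough equivalent of TemporaryDirectory(delete=clean) for Python < 3.12:
# create the directory manually and only remove it when cleanup is requested.
# Usage: `with checkpoint_dir(clean) as tmp_dir: model.save_pretrained(tmp_dir)`
import contextlib
import shutil
import tempfile


@contextlib.contextmanager
def checkpoint_dir(clean: bool):
    tmp_dir = tempfile.mkdtemp()
    try:
        yield tmp_dir
    finally:
        if clean:
            shutil.rmtree(tmp_dir, ignore_errors=True)
```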