Commit d6dbbc9

ENH: Method comparison improve logging (#2591)
- Print early how the experiment is categorized
- Last-resort save_dir so that results are not lost
- Catch errors in general, not only OOM
- Log the error message
- Wrap checkpoint saving in try ... except, just in case (otherwise, if it fails, no logs are written)
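
Taken together, the changes boil down to one policy: a run should always leave logs behind, even when something goes wrong. A minimal Python sketch of that combined flow (a condensation for illustration only; run_training, write_results, and result_path are hypothetical stand-ins, not the commit's actual API):

import tempfile

def run_experiment(run_training, write_results, result_path="results"):
    # Sketch only: run_training, write_results, and result_path stand in
    # for the real helpers in run.py/utils.py.
    error_msg = ""
    save_dir = result_path
    try:
        run_training()
    except Exception as exc:           # catch errors in general, not only OOM
        error_msg = str(exc)           # record the message so it gets logged
        save_dir = tempfile.mkdtemp()  # last-resort save_dir, results survive
    write_results(save_dir, {"error_msg": error_msg})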

2 files changed: 50 additions & 28 deletions

method_comparison/MetaMathQA/run.py

Lines changed: 44 additions & 28 deletions
@@ -48,6 +48,7 @@
     get_dataset_info,
     get_model,
     get_optimizer_and_scheduler,
+    get_peft_branch,
     get_tokenizer,
     get_train_config,
     init_cuda,
@@ -163,6 +164,7 @@ def train(
     status = TrainStatus.FAILED
     tic_train = time.perf_counter()
     eval_time = 0.0
+    error_msg = ""
 
     ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
         tokenizer=tokenizer, query_template=query_template, print_fn=print_verbose
@@ -318,10 +320,16 @@ def train(
     except KeyboardInterrupt:
         print_verbose("canceled training")
         status = TrainStatus.CANCELED
-    except torch.OutOfMemoryError:
+        error_msg = "manually canceled"
+    except torch.OutOfMemoryError as exc:
         # ouch, still let's try to log some results
         print_verbose("out of memory error encountered")
         status = TrainStatus.CANCELED
+        error_msg = str(exc)
+    except Exception as exc:
+        print_verbose(f"encountered an error: {exc}")
+        status = TrainStatus.CANCELED
+        error_msg = str(exc)
 
     toc_train = time.perf_counter()
     train_time = toc_train - tic_train - eval_time
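
A detail worth noting in the hunk above: torch.OutOfMemoryError is a RuntimeError subclass, so it also matches except Exception; the dedicated OOM handler therefore has to come before the new generic clause, or it would never run. A standalone sketch of that ordering, with a stand-in error class so no GPU is needed:

class FakeOutOfMemoryError(RuntimeError):
    """Stand-in for torch.OutOfMemoryError, itself a RuntimeError subclass."""

def classify(raise_error):
    try:
        raise_error()
    except FakeOutOfMemoryError as exc:  # must precede the generic handler,
        return f"oom: {exc}"             # or except Exception would swallow it
    except Exception as exc:
        return f"other: {exc}"

def boom():
    raise FakeOutOfMemoryError("CUDA out of memory")

print(classify(boom))  # prints: oom: CUDA out of memory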
@@ -334,6 +342,7 @@ def train(
         cuda_memory_reserved_log=cuda_memory_reserved_log,
         losses=losses,
         metrics=metrics,
+        error_msg=error_msg,
     )
     return train_result
 
@@ -342,6 +351,14 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
     tic_total = time.perf_counter()
     start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()
 
+    peft_branch = get_peft_branch()
+    if peft_branch == "main":
+        print_verbose("===== This experiment is categorized as a MAIN run because the PEFT branch is 'main' ======")
+    else:
+        print_verbose(
+            f"===== This experiment is categorized as a TEST run because the PEFT branch is '{peft_branch}' ======"
+        )
+
     # load configs
     peft_config = PeftConfig.from_pretrained(path_experiment)
     path_train_config = os.path.join(path_experiment, FILE_NAME_TRAIN_PARAMS)
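
The early banner relies on get_peft_branch(), whose implementation lives in utils.py and is not part of this diff. A plausible sketch of such a branch lookup (hypothetical, for illustration; the real helper may differ):

import subprocess

def get_current_branch(repo_root="."):
    # Hypothetical: ask git for the currently checked-out branch name.
    result = subprocess.run(
        ["git", "rev-parse", "--abbrev-ref", "HEAD"],
        cwd=repo_root, capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()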
@@ -366,39 +383,38 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
     print_verbose(model)
 
     # train model
-    try:
-        train_result = train(
-            model=model,
-            max_steps=train_config.max_steps,
-            batch_size=train_config.batch_size,
-            batch_size_eval=train_config.batch_size_eval,
-            tokenizer=tokenizer,
-            cuda_memory_init=cuda_memory_init,
-            eval_steps=train_config.eval_steps,
-            generation_kwargs=train_config.generation_kwargs,
-            grad_norm_clip=train_config.grad_norm_clip,
-            optimizer_type=train_config.optimizer_type,
-            optimizer_kwargs=train_config.optimizer_kwargs,
-            query_template=train_config.query_template,
-            lr_scheduler_arg=train_config.lr_scheduler,
-            use_amp=train_config.use_amp,
-            is_adalora=isinstance(peft_config, AdaLoraConfig),
-        )
-    except Exception as e:
-        print_verbose(f"Training failed with error: {e}")
-        raise
+    train_result = train(
+        model=model,
+        max_steps=train_config.max_steps,
+        batch_size=train_config.batch_size,
+        batch_size_eval=train_config.batch_size_eval,
+        tokenizer=tokenizer,
+        cuda_memory_init=cuda_memory_init,
+        eval_steps=train_config.eval_steps,
+        generation_kwargs=train_config.generation_kwargs,
+        grad_norm_clip=train_config.grad_norm_clip,
+        optimizer_type=train_config.optimizer_type,
+        optimizer_kwargs=train_config.optimizer_kwargs,
+        query_template=train_config.query_template,
+        lr_scheduler_arg=train_config.lr_scheduler,
+        use_amp=train_config.use_amp,
+        is_adalora=isinstance(peft_config, AdaLoraConfig),
+    )
 
     if train_result.status == TrainStatus.FAILED:
         print_verbose("Training failed, not logging results")
         sys.exit(1)
 
     # save the model in temp dir, get file size, clean it up afterwards if clean is passed
-    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
-        model.save_pretrained(tmp_dir)
-        stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
-        file_size = stat.st_size
-        if not clean:
-            print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
+    try:
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
+            file_size = stat.st_size
+            if not clean:
+                print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
+    except Exception as exc:
+        print(f"Failed to save PEFT checkpoint due to the following error: {exc}")
 
     time_total = time.perf_counter() - tic_total
     # log results: print and save to file
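
Two stdlib details matter for the checkpoint-saving block above: tempfile.TemporaryDirectory accepts ignore_cleanup_errors since Python 3.10 and delete since Python 3.12, so delete=clean keeps the directory around when clean is falsy. A quick illustration:

import os
import tempfile

with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=False) as tmp_dir:
    path = os.path.join(tmp_dir, "weights.safetensors")
    open(path, "w").close()
print(os.path.exists(path))  # True: with delete=False the directory is kept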

method_comparison/MetaMathQA/utils.py

Lines changed: 6 additions & 0 deletions
@@ -21,6 +21,7 @@
 import os
 import platform
 import subprocess
+import tempfile
 import warnings
 from dataclasses import asdict, dataclass
 from decimal import Decimal, DivisionByZero, InvalidOperation
@@ -545,6 +546,7 @@ class TrainResult:
     cuda_memory_reserved_log: list[int]
     losses: list[float]
     metrics: list[Any]  # TODO
+    error_msg: str
 
 
 def log_to_console(log_data: dict[str, Any], print_fn: Callable[..., None]) -> None:
@@ -621,6 +623,9 @@ def log_results(
     elif train_result.status == TrainStatus.SUCCESS:
         save_dir = RESULT_PATH
         print_fn("Experiment run was categorized as successful run")
+    else:
+        save_dir = tempfile.mkdtemp()
+        print_fn(f"Experiment could not be categorized, writing results to {save_dir}. Please open an issue on PEFT.")
 
     peft_config_dict = peft_config.to_dict()
     for key, value in peft_config_dict.items():
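
The fallback works because tempfile.mkdtemp(), unlike tempfile.TemporaryDirectory, only creates the directory and leaves cleanup to the caller, so results written there survive the process. A quick illustration:

import os
import tempfile

save_dir = tempfile.mkdtemp()
with open(os.path.join(save_dir, "results.json"), "w") as f:
    f.write("{}")
print(save_dir)  # e.g. /tmp/tmp3k1x_9qz; persists until deleted manually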
@@ -635,6 +640,7 @@
             "peft_branch": peft_branch,
             "train_config": asdict(train_config),
             "peft_config": peft_config_dict,
+            "error_msg": train_result.error_msg,
         },
         "train_info": {
             "cuda_memory_reserved_avg": cuda_memory_avg,