     get_dataset_info,
     get_model,
     get_optimizer_and_scheduler,
+    get_peft_branch,
     get_tokenizer,
     get_train_config,
     init_cuda,
@@ -163,6 +164,7 @@ def train(
     status = TrainStatus.FAILED
     tic_train = time.perf_counter()
     eval_time = 0.0
+    error_msg = ""
 
     ds_train, ds_valid, ds_test = get_train_valid_test_datasets(
         tokenizer=tokenizer, query_template=query_template, print_fn=print_verbose
@@ -318,10 +320,16 @@ def train(
     except KeyboardInterrupt:
         print_verbose("canceled training")
         status = TrainStatus.CANCELED
-    except torch.OutOfMemoryError:
+        error_msg = "manually canceled"
+    except torch.OutOfMemoryError as exc:
         # ouch, still let's try to log some results
         print_verbose("out of memory error encountered")
         status = TrainStatus.CANCELED
+        error_msg = str(exc)
+    except Exception as exc:
+        print_verbose(f"encountered an error: {exc}")
+        status = TrainStatus.CANCELED
+        error_msg = str(exc)
 
     toc_train = time.perf_counter()
     train_time = toc_train - tic_train - eval_time
@@ -334,6 +342,7 @@ def train(
334
342
cuda_memory_reserved_log = cuda_memory_reserved_log ,
335
343
losses = losses ,
336
344
metrics = metrics ,
345
+ error_msg = error_msg ,
337
346
)
338
347
return train_result
339
348
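Note: with these changes, train() no longer lets training errors escape. KeyboardInterrupt, CUDA OOM, and any other exception are converted into a CANCELED status plus an error_msg string that travels with the returned result, so partial metrics can still be logged. A minimal, self-contained sketch of that pattern follows; the TrainResult/TrainStatus fields shown here are illustrative stand-ins, not the repo's actual definitions:

```python
import dataclasses
import enum
import time


class TrainStatus(enum.Enum):
    SUCCESS = "success"
    CANCELED = "canceled"
    FAILED = "failed"


@dataclasses.dataclass
class TrainResult:
    status: TrainStatus
    train_time: float
    error_msg: str = ""


def run_training(step_fn, max_steps: int) -> TrainResult:
    # capture errors instead of re-raising, so the caller always gets a result
    status = TrainStatus.FAILED
    error_msg = ""
    tic = time.perf_counter()
    try:
        for _ in range(max_steps):
            step_fn()
        status = TrainStatus.SUCCESS
    except KeyboardInterrupt:
        status = TrainStatus.CANCELED
        error_msg = "manually canceled"
    except Exception as exc:
        status = TrainStatus.CANCELED
        error_msg = str(exc)
    return TrainResult(status=status, train_time=time.perf_counter() - tic, error_msg=error_msg)
```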
@@ -342,6 +351,14 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
     tic_total = time.perf_counter()
     start_date = dt.datetime.now(tz=dt.timezone.utc).replace(microsecond=0).isoformat()
 
+    peft_branch = get_peft_branch()
+    if peft_branch == "main":
+        print_verbose("===== This experiment is categorized as a MAIN run because the PEFT branch is 'main' ======")
+    else:
+        print_verbose(
+            f"===== This experiment is categorized as a TEST run because the PEFT branch is '{peft_branch}' ======"
+        )
+
     # load configs
     peft_config = PeftConfig.from_pretrained(path_experiment)
     path_train_config = os.path.join(path_experiment, FILE_NAME_TRAIN_PARAMS)
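Note: get_peft_branch() comes from the benchmark's utility module and its body is not part of this diff. Judging from the log messages, it presumably reports the git branch of the local PEFT checkout so that runs from main can be separated from test runs on feature branches. A purely hypothetical sketch of such a helper, not the actual implementation:

```python
# Hypothetical stand-in for get_peft_branch(): report the git branch of the
# installed PEFT checkout, falling back to a placeholder when PEFT was not
# installed from a git repository.
import os
import subprocess

import peft


def get_peft_branch_sketch() -> str:
    repo_dir = os.path.dirname(os.path.dirname(peft.__file__))
    try:
        out = subprocess.run(
            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
            cwd=repo_dir,
            capture_output=True,
            text=True,
            check=True,
        )
        return out.stdout.strip()
    except (OSError, subprocess.CalledProcessError):
        return "unknown"
```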
@@ -366,39 +383,38 @@ def main(*, path_experiment: str, experiment_name: str, clean: bool) -> None:
     print_verbose(model)
 
     # train model
-    try:
-        train_result = train(
-            model=model,
-            max_steps=train_config.max_steps,
-            batch_size=train_config.batch_size,
-            batch_size_eval=train_config.batch_size_eval,
-            tokenizer=tokenizer,
-            cuda_memory_init=cuda_memory_init,
-            eval_steps=train_config.eval_steps,
-            generation_kwargs=train_config.generation_kwargs,
-            grad_norm_clip=train_config.grad_norm_clip,
-            optimizer_type=train_config.optimizer_type,
-            optimizer_kwargs=train_config.optimizer_kwargs,
-            query_template=train_config.query_template,
-            lr_scheduler_arg=train_config.lr_scheduler,
-            use_amp=train_config.use_amp,
-            is_adalora=isinstance(peft_config, AdaLoraConfig),
-        )
-    except Exception as e:
-        print_verbose(f"Training failed with error: {e}")
-        raise
+    train_result = train(
+        model=model,
+        max_steps=train_config.max_steps,
+        batch_size=train_config.batch_size,
+        batch_size_eval=train_config.batch_size_eval,
+        tokenizer=tokenizer,
+        cuda_memory_init=cuda_memory_init,
+        eval_steps=train_config.eval_steps,
+        generation_kwargs=train_config.generation_kwargs,
+        grad_norm_clip=train_config.grad_norm_clip,
+        optimizer_type=train_config.optimizer_type,
+        optimizer_kwargs=train_config.optimizer_kwargs,
+        query_template=train_config.query_template,
+        lr_scheduler_arg=train_config.lr_scheduler,
+        use_amp=train_config.use_amp,
+        is_adalora=isinstance(peft_config, AdaLoraConfig),
+    )
 
     if train_result.status == TrainStatus.FAILED:
         print_verbose("Training failed, not logging results")
         sys.exit(1)
 
     # save the model in temp dir, get file size, clean it up afterwards if clean is passed
-    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
-        model.save_pretrained(tmp_dir)
-        stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
-        file_size = stat.st_size
-        if not clean:
-            print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
+    try:
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True, delete=clean) as tmp_dir:
+            model.save_pretrained(tmp_dir)
+            stat = os.stat(os.path.join(tmp_dir, SAFETENSORS_WEIGHTS_NAME))
+            file_size = stat.st_size
+            if not clean:
+                print_verbose(f"Saved PEFT checkpoint to {tmp_dir}")
+    except Exception as exc:
+        print(f"Failed to save PEFT checkpoint due to the following error: {exc}")
 
     time_total = time.perf_counter() - tic_total
     # log results: print and save to file
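One portability note on the final hunk: the delete= keyword of tempfile.TemporaryDirectory was added in Python 3.12 (and ignore_cleanup_errors in 3.10), so this code path assumes a recent interpreter. On older Pythons, roughly the same behavior could be approximated with mkdtemp, as in this sketch:

```python
# Rough equivalent of TemporaryDirectory(delete=clean) for Python < 3.12:
# create the directory manually and only remove it when cleanup is requested.
# Usage: `with checkpoint_dir(clean) as tmp_dir: model.save_pretrained(tmp_dir)`
import contextlib
import shutil
import tempfile


@contextlib.contextmanager
def checkpoint_dir(clean: bool):
    tmp_dir = tempfile.mkdtemp()
    try:
        yield tmp_dir
    finally:
        if clean:
            shutil.rmtree(tmp_dir, ignore_errors=True)
```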