2 files changed: +17 −2

@@ -134,7 +134,6 @@ def __init__(self, cfg: DictConfig) -> None:
                 "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead."
             )

-        self._checkpoint_client = CheckpointClient(cfg)
         # Set up the backend for distributed training (NCCL, GLOO, etc.)
         self._enable_async_checkpointing = cfg.get("enable_async_checkpointing", False)
         self.fsdp_cpu_offload = cfg.get("fsdp_cpu_offload", False)
@@ -149,6 +148,8 @@ def __init__(self, cfg: DictConfig) -> None:

         self._is_rank_zero = self.rank == 0

+        self._checkpoint_client = CheckpointClient(cfg)
+
         # logging attributes
         self._output_dir = cfg.output_dir
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
@@ -296,7 +297,7 @@ def setup(self, cfg: DictConfig) -> None:
                 )
             )
         except Exception as e:
-            log.warning(
+            self._logger.warning(
                 f"Failed to load distributed checkpoint: {e}. Training will start from the base checkpoint."
             )


Second file (under torchtune/training/checkpointing):

@@ -256,6 +256,20 @@ def _save_checkpoint_sync(
             optim_state_dict = {}

         if is_not_distributed_checkpointer and not single_device:
+            # This wait is needed because staging an async checkpoint relies on CPU
+            # gathering, which the sync checkpoint save below also uses, and the two
+            # cause issues when they run concurrently. In theory this case should
+            # never be hit, since an epoch takes much longer than an async checkpoint,
+            # but it does come up in a test case with a very fast epoch.
+            if self._get_dcp_checkpointer()._checkpoint_future is not None:
+                time_start_waiting = time.perf_counter()
+                self._get_dcp_checkpointer()._checkpoint_future.result()
+                if self._is_rank_zero:
+                    log.info(
+                        "Waiting for async checkpoint to finish, to save sync checkpoint, "
+                        f"took {time.perf_counter() - time_start_waiting:.2f} secs"
+                    )
+
             # To prevent GPU memory from spiking during checkpoint save,
             # we consolidate the full model and optim state dicts on CPU for rank 0
             model_state_dict = training.gather_cpu_state_dict(
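
For context on the guard added to _save_checkpoint_sync above: the sync save path first blocks on any in-flight async checkpoint future so that only one CPU gather/write runs at a time. The sketch below illustrates that pattern in isolation; TinyCheckpointer, save_async, save_sync, and _write are hypothetical names for illustration, not torchtune APIs.

import time
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Optional


class TinyCheckpointer:
    """Hypothetical stand-in for the checkpoint client in the diff above."""

    def __init__(self) -> None:
        self._executor = ThreadPoolExecutor(max_workers=1)
        # Tracks an in-flight async save, mirroring the _checkpoint_future
        # attribute checked in _save_checkpoint_sync.
        self._checkpoint_future: Optional[Future] = None

    def save_async(self, state: dict) -> None:
        # Kick off a background save and remember its future.
        self._checkpoint_future = self._executor.submit(self._write, state, "async")

    def save_sync(self, state: dict) -> None:
        # Block on any in-flight async save first so the two saves never
        # gather/write state concurrently (the guard added in the diff).
        if self._checkpoint_future is not None:
            start = time.perf_counter()
            self._checkpoint_future.result()
            print(f"waited {time.perf_counter() - start:.2f}s for async checkpoint")
            self._checkpoint_future = None
        self._write(state, "sync")

    def _write(self, state: dict, kind: str) -> None:
        # Stand-in for gathering state on CPU and writing it to disk.
        time.sleep(0.1)
        print(f"{kind} checkpoint saved: {sorted(state)}")


if __name__ == "__main__":
    ckpt = TinyCheckpointer()
    ckpt.save_async({"model": 1})
    ckpt.save_sync({"model": 1, "optim": 2})  # waits for the async save first

Waiting on .result() simply serializes the two saves, which is cheap in the normal case where no async save is pending.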