@@ -4683,10 +4683,22 @@ def check_hotswap(self, do_hotswap, ranks, alpha_scalings):
             output_after1 = model(inputs).logits
             assert torch.allclose(output1, output_after1, atol=tol, rtol=tol)
 
+            # we need to call forward a third time since cudagraphs are not recorded in the first call
+            if do_hotswap:
+                hotswap_adapter(model, os.path.join(tmp_dirname, "adapter0"), adapter_name="default")
+                output_after2 = model(inputs).logits
+                assert torch.allclose(output0, output_after2, atol=tol, rtol=tol)
+
     # it is important to check hotswapping small to large ranks and large to small ranks
     @pytest.mark.parametrize("ranks", [(11, 11), (7, 13), (13, 7)])
     def test_hotswapping_compiled_model_does_not_trigger_recompilation(self, ranks):
-        with torch._dynamo.config.patch(error_on_recompile=True):  # raise an error on recompilation
+        # here we set three configs to ensure that no recompilation or cudagraph re-recording occurs:
+        # 1. error_on_recompile: raise an error on recompilation
+        # 2. inline_inbuilt_nn_modules: needed to raise an error on static input address changes instead of re-recording
+        # 3. triton.cudagraph_support_input_mutation: same as above
+        dynamo_config_ctx = torch._dynamo.config.patch(error_on_recompile=True, inline_inbuilt_nn_modules=False)
+        inductor_config_ctx = torch._inductor.config.patch("triton.cudagraph_support_input_mutation", False)
+        with dynamo_config_ctx, inductor_config_ctx:
             self.check_hotswap(do_hotswap=True, ranks=ranks, alpha_scalings=ranks)
 
     def test_no_hotswapping_compiled_model_triggers_recompilation(self):
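For context on the `error_on_recompile` flag the test relies on, here is a minimal, self-contained sketch (not part of the PR) of how `torch._dynamo.config.patch(error_on_recompile=True)` turns a silent recompilation into a hard error. The toy function `fn` and the tensor shapes are illustrative assumptions, not taken from the test suite.

```python
import torch

def fn(x):
    return x * 2

# dynamic=False pins the compiled graph to the input shapes seen so far,
# so feeding a new shape forces a recompilation
compiled_fn = torch.compile(fn, dynamic=False)

with torch._dynamo.config.patch(error_on_recompile=True):
    compiled_fn(torch.ones(4))      # first call: compiles the graph, no error
    compiled_fn(torch.ones(4))      # same shape: compiled graph is reused
    try:
        compiled_fn(torch.ones(8))  # different shape: would recompile, so an error is raised
    except Exception as exc:        # exact exception class depends on the torch version
        print(f"recompilation detected: {type(exc).__name__}")
```

In the test above, the same mechanism applies: `check_hotswap` runs forward passes on the compiled model before and after hotswapping, and any guard failure caused by the swap would surface as an error instead of a silent recompile or cudagraph re-record.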