Commit 665ec96

default decoupled weight decay to false
1 parent: 32062de

File tree

adam_atan2_pytorch/adam_atan2.py
adam_atan2_pytorch/foreach.py
pyproject.toml

3 files changed: +16 -5 lines changed


adam_atan2_pytorch/adam_atan2.py

Lines changed: 8 additions & 1 deletion
@@ -19,6 +19,7 @@ def __init__(
         lr = 1e-4,
         betas: Tuple[float, float] = (0.9, 0.99),
         weight_decay = 0.,
+        decoupled_wd = False,
         a = 1.27,
         b = 1.
     ):
@@ -27,6 +28,7 @@ def __init__(
         assert weight_decay >= 0.
 
         self._init_lr = lr
+        self.decoupled_wd = decoupled_wd
 
         defaults = dict(
             lr = lr,
@@ -54,7 +56,12 @@ def step(
 
                 grad, lr, wd, beta1, beta2, a, b, state, init_lr = p.grad, group['lr'], group['weight_decay'], *group['betas'], group['a'], group['b'], self.state[p], self._init_lr
 
-                # decoupled weight decay
+                # maybe decoupled weight decay
+
+                if self.decoupled_wd:
+                    wd /= init_lr
+
+                # weight decay
 
                 if wd > 0.:
                     p.mul_(1. - lr / init_lr * wd)
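
For context, a quick usage sketch of the new flag (assuming AdamAtan2 is exported at the package root; the toy module and hyperparameter values below are purely illustrative):

import torch
from adam_atan2_pytorch import AdamAtan2

model = torch.nn.Linear(8, 2)  # toy module, illustrative only

# decoupled_wd now defaults to False; pass True to have the optimizer
# divide weight_decay by the initial learning rate, as in the hunk above
opt = AdamAtan2(
    model.parameters(),
    lr = 1e-4,
    weight_decay = 1e-2,
    decoupled_wd = True
)

model(torch.randn(4, 8)).sum().backward()
opt.step()
opt.zero_grad()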

adam_atan2_pytorch/foreach.py

Lines changed: 7 additions & 3 deletions
@@ -31,6 +31,7 @@ def __init__(
         lr = 1e-4,
         betas: Tuple[float, float] = (0.9, 0.99),
         weight_decay = 0.,
+        decoupled_wd = False,
         a = 1.27,
         b = 1.,
         foreach_atan2_fn: Callable | None = None
@@ -41,6 +42,7 @@ def __init__(
         assert all([hasattr(torch, f'_foreach_{attr}_') for attr in ('mul', 'add', 'lerp', 'sqrt')]), 'this version of torch does not have the prerequisite foreach functions'
 
         self._init_lr = lr
+        self.decoupled_wd = decoupled_wd
 
         self._foreach_atan2_ = default(
             foreach_atan2_fn,
@@ -74,6 +76,8 @@ def step(
 
             wd, lr, beta1, beta2, a, b = group['weight_decay'], group['lr'], *group['betas'], group['a'], group['b']
 
+            has_weight_decay = wd > 0
+
             # accumulate List[Tensor] for foreach inplace updates
 
             params = []
@@ -86,9 +90,9 @@ def step(
 
                 grad, state = p.grad, self.state[p]
 
-                # decoupled weight decay
+                # maybe decoupled weight decay
 
-                if wd > 0.:
+                if self.decoupled_wd and has_weight_decay:
                     wd /= init_lr
 
                 # init state if needed
@@ -123,7 +127,7 @@ def step(
 
             # weight decay
 
-            if wd > 0.:
+            if has_weight_decay:
                 torch._foreach_mul_(params, 1. - lr * wd)
 
             # decay running averages
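
A small numeric sketch of what the decoupled path does to the decay multiplier 1. - lr * wd that torch._foreach_mul_ applies above (the values are hypothetical, chosen only to make the scaling visible; this sketches the arithmetic, not the optimizer internals):

init_lr = 1e-4   # lr the optimizer was constructed with (self._init_lr)
lr = 1e-5        # current lr, e.g. after a schedule has decayed it
wd = 1e-2        # weight_decay

# default (decoupled_wd = False): decay shrinks with the absolute lr
print(1. - lr * wd)            # ~0.9999999

# decoupled_wd = True: wd is divided by init_lr, so the decay tracks
# the ratio lr / init_lr rather than the absolute lr
wd_decoupled = wd / init_lr    # ~100.0
print(1. - lr * wd_decoupled)  # ~0.999, i.e. 1 - (lr / init_lr) * wd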

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "adam-atan2-pytorch"
-version = "0.0.9"
+version = "0.0.10"
 description = "Adam-atan2 for Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
