Commit 61e484b

trust @jfpuget and default example to using parametrize
Parent: 6f72109

2 files changed (+9, -4 lines)

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "nGPT-pytorch"
-version = "0.1.17"
+version = "0.1.18"
 description = "nGPT"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

train.py

Lines changed: 8 additions & 3 deletions
@@ -25,10 +25,14 @@
 GENERATE_EVERY = 500
 GENERATE_LENGTH = 512
 SEQ_LEN = 512
+
 USE_AMP = True
+USE_PARAMETRIZE = True # whether to manually update weights after each optimizer step
 
 device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
 
+assert not (USE_AMP and not torch.cuda.is_available())
+
 # helpers
 
 def exists(v):
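The new USE_PARAMETRIZE flag picks how the unit-norm constraint on the weights is enforced. With the flag on, the model is constructed with manual_norm_weights disabled, so the weights can stay normalized through a parametrization (as the commit message suggests) and no explicit renormalization is needed after each optimizer step; with it off, the script falls back to calling model.norm_weights_() by hand, as the last hunk below shows. The added assert simply refuses to enable AMP on a machine without CUDA. Below is a minimal sketch of the parametrize mechanism using torch.nn.utils.parametrize on a plain Linear layer; the L2Normalize module and the choice of dim are illustrative, not nGPT-pytorch internals.

import torch
import torch.nn.functional as F
from torch import nn
from torch.nn.utils import parametrize

class L2Normalize(nn.Module):
    # every read of the parametrized weight returns rows rescaled to unit l2 norm
    def forward(self, weight):
        return F.normalize(weight, p = 2, dim = -1)

linear = nn.Linear(512, 512, bias = False)
parametrize.register_parametrization(linear, 'weight', L2Normalize())

# linear.weight now always comes back unit-norm, including inside forward passes,
# so the training loop never has to renormalize it explicitly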
@@ -94,8 +98,8 @@ def base_decoding(
     num_tokens = 256,
     dim = 512,
     depth = 8,
-    manual_norm_weights = True,
-    tied_embedding = True
+    tied_embedding = True,
+    manual_norm_weights = not USE_PARAMETRIZE
 ).to(device)
 
 scaler = GradScaler(enabled = USE_AMP)
@@ -153,7 +157,8 @@ def __getitem__(self, index):
 
     optim.zero_grad()
 
-    model.norm_weights_()
+    if not USE_PARAMETRIZE:
+        model.norm_weights_()
 
     if i % VALIDATE_EVERY == 0:
         model.eval()
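For reference, a self-contained sketch of the manual branch that this hunk makes conditional: after optimizer.step() the weight rows are pushed back onto the unit sphere in-place, which is the role model.norm_weights_() plays for the full network. The toy Linear layer and loss here are stand-ins, not the script's model.

import torch
import torch.nn.functional as F
from torch import nn

USE_PARAMETRIZE = False          # the manual branch of the new flag

linear = nn.Linear(512, 512, bias = False)
optim = torch.optim.Adam(linear.parameters(), lr = 1e-3)

x = torch.randn(8, 512)
loss = linear(x).pow(2).mean()   # stand-in loss
loss.backward()
optim.step()
optim.zero_grad()

if not USE_PARAMETRIZE:
    # the optimizer step moved the rows off the sphere; renormalize them in-place
    with torch.no_grad():
        linear.weight.copy_(F.normalize(linear.weight, p = 2, dim = -1))

With USE_PARAMETRIZE = True this whole renormalization block disappears, and a parametrization like the one sketched earlier keeps the constraint satisfied automatically.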
