Commit 43f1b75

go for an improvised solution for what may be an issue with autoregressive and laser
1 parent bb54873 commit 43f1b75

6 files changed: +10, −8 lines

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [project]
 name = "transfusion-pytorch"
-version = "0.6.3"
+version = "0.6.4"
 description = "Transfusion in Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

train_latent_with_text.py

Lines changed: 1 addition & 1 deletion

@@ -95,7 +95,7 @@ def encode_tokens(str: str) -> Tensor:
         dim = 128,
         depth = 8,
         dim_head = 64,
-        heads = 8
+        heads = 8,
     )
 ).cuda()

train_mnist.py

Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ def forward(self, x):
         dim = 64,
         depth = 4,
         dim_head = 32,
-        heads = 8
+        heads = 8,
     )
 ).cuda()

train_mnist_vae.py

Lines changed: 1 addition & 1 deletion

@@ -127,7 +127,7 @@ def forward(self, x):
         dim = 64,
         depth = 4,
         dim_head = 32,
-        heads = 8
+        heads = 8,
     )
 ).cuda()

train_mnist_with_unet.py

Lines changed: 1 addition & 1 deletion

@@ -61,7 +61,7 @@ def forward(self, x):
         dim = 64,
         depth = 4,
         dim_head = 32,
-        heads = 8
+        heads = 8,
     )
 ).to(device)

transfusion_pytorch/transfusion.py

Lines changed: 5 additions & 3 deletions

@@ -757,6 +757,7 @@ def __init__(
         use_flex_attn = False,
         gate_values = True,
         laser = False,
+        laser_softclamp_value = 15.,
         learned_value_residual_mix = False
     ):
         super().__init__()

@@ -785,6 +786,7 @@ def __init__(
         self.softcap_value = softcap_value

         self.laser = laser
+        self.laser_softclamp_value = laser_softclamp_value

         self.dropout = nn.Dropout(dropout)

@@ -850,8 +852,8 @@ def forward(
         # laser attention

         if self.laser:
-            v_max = v.amax(dim = -2, keepdim = True).detach()
-            v = (v - v_max).exp()
+            v = softclamp(v, self.laser_softclamp_value)
+            v = v.exp()

         # whether to use flex attention or not

@@ -890,7 +892,7 @@ def forward(
         # laser attention

         if self.laser:
-            out = log(out) + v_max
+            out = log(out)

         # maybe gate values