import math
from collections import defaultdict

+ from random import randrange
+ from itertools import count
from functools import partial, wraps, cache
from typing import NamedTuple, Callable, Literal

@@ -591,6 +593,97 @@ def forward(
        fourier_embed, _ = pack((times, freqs.sin(), freqs.cos()), 'b n *')
        return fourier_embed

+ # hyper connections - multiple residual streams
+
+ class Residual(Module):
+     def __init__(self, **kwargs):
+         super().__init__()
+
+     def prepare_with_inverse(self, residuals):
+         branch_input, residuals, residual_kwargs = self.prepare(residuals)
+
+         def inverse(branch_out):
+             return self(branch_out, residuals, **residual_kwargs)
+
+         return branch_input, inverse
+
+     def prepare(self, residuals):
+         return residuals, residuals, dict()
+
+     def forward(self, branch_out, residuals, **kwargs):
+         return branch_out + residuals
+
+ class HyperConnections(Module):
+     def __init__(
+         self,
+         dim,
+         *,
+         num_residual_streams,
+         layer_index = None,
+         tanh = True,
+         **kwargs
+     ):
+         """
+         https://arxiv.org/abs/2409.19606
+         Appendix J - Algorithm 2, Dynamic only
+         """
+         super().__init__()
+
+         self.act = nn.Tanh() if tanh else nn.Identity()
+
+         self.norm = nn.RMSNorm(dim)
+
+         self.num_residual_streams = num_residual_streams
+         layer_index = default(layer_index, randrange(num_residual_streams)) # just choose one random residual stream if layer index not given
+
+         self.static_beta = nn.Parameter(torch.ones(num_residual_streams))
+
+         init_alpha0 = torch.zeros((num_residual_streams, 1))
+         init_alpha0[layer_index % num_residual_streams, 0] = 1.
+
+         self.static_alpha = nn.Parameter(torch.cat([init_alpha0, torch.eye(num_residual_streams)], dim = 1))
+
+         self.dynamic_alpha_fn = nn.Parameter(torch.zeros(dim, num_residual_streams + 1))
+         self.dynamic_alpha_scale = nn.Parameter(torch.ones(()) * 1e-2)
+         self.dynamic_beta_fn = nn.Parameter(torch.zeros(dim))
+         self.dynamic_beta_scale = nn.Parameter(torch.ones(()) * 1e-2)
+
+     def prepare_with_inverse(self, residuals):
+         branch_input, residuals, residual_kwargs = self.prepare(residuals)
+
+         def inverse(branch_out):
+             return self(branch_out, residuals, **residual_kwargs)
+
+         return branch_input, inverse
+
+     def prepare(self, residuals):
+
+         residuals = rearrange(residuals, '(b s) n d -> b n s d', s = self.num_residual_streams)
+
+         normed = self.norm(residuals)
+
+         wc_weight = self.act(normed @ self.dynamic_alpha_fn)
+         dynamic_alpha = wc_weight * self.dynamic_alpha_scale
+         alpha = dynamic_alpha + self.static_alpha
+
+         dc_weight = self.act(normed @ self.dynamic_beta_fn)
+         dynamic_beta = dc_weight * self.dynamic_beta_scale
+         beta = dynamic_beta + self.static_beta
+
+         # width connection
+
+         mix_h = einsum(alpha, residuals, '... s t, ... s d -> ... t d')
+
+         branch_input, residuals = mix_h[..., 0, :], mix_h[..., 1:, :]
+
+         return branch_input, residuals, dict(beta = beta)
+
+     def forward(self, branch_output, residuals, *, beta):
+         # 'depth' connection
+
+         residuals = einsum(branch_output, beta, 'b n d, b n s -> b n s d') + residuals
+         return rearrange(residuals, 'b n s d -> (b s) n d')
+

# adaptive layernorm and ada-ln zero rolled into one wrapper
# from DiT paper and sota for time conditioning for now

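A quick, hedged sanity check of the class added above, not part of the commit: the width connection in `prepare` mixes the `s` residual streams into a single branch input, and the closure returned by `prepare_with_inverse` applies the depth connection that writes the branch output back into every stream. The batch size, sequence length, dim, stream count, and the `nn.Linear` stand-in branch below are illustrative only, and the sketch assumes `HyperConnections` (plus the repo's `default`, `rearrange`, and `einsum` helpers it relies on) is importable.

import torch
from torch import nn

# 2 samples, 16 tokens, dim 512, 4 residual streams (illustrative values)
hc = HyperConnections(512, num_residual_streams = 4, layer_index = 0)

branch = nn.Linear(512, 512)             # stand-in for the attention or feedforward branch

residuals = torch.randn(2 * 4, 16, 512)  # streams folded into batch: (batch * streams, seq, dim)

branch_input, add_residual = hc.prepare_with_inverse(residuals)
print(branch_input.shape)                # torch.Size([2, 16, 512]) - width connection mixes streams down

out = add_residual(branch(branch_input)) # depth connection adds the branch output back to every stream
print(out.shape)                         # torch.Size([8, 16, 512])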
@@ -940,7 +1033,8 @@ def __init__(
        ff_kwargs: dict = dict(),
        attn_laser = False,
        unet_skips = True,
-         use_flex_attn = False
+         use_flex_attn = False,
+         num_residual_streams = 1
    ):
        super().__init__()
        self.use_flex_attn = use_flex_attn
@@ -954,6 +1048,17 @@ def __init__(
            nn.SiLU()
        )

+         # hyper connections
+
+         assert num_residual_streams > 0
+         is_hyper_connection = num_residual_streams > 1
+         self.num_residual_streams = num_residual_streams
+
+         counter = count()
+         residual_klass = Residual if num_residual_streams == 1 else HyperConnections
+
+         # layers
+
        layers = ModuleList([])

        for ind in range(depth):
@@ -970,7 +1075,10 @@ def __init__(
            attn = AdaptiveWrapper(attn, dim = dim, dim_cond = dim * 4)
            ff = AdaptiveWrapper(ff, dim = dim, dim_cond = dim * 4)

-             layers.append(ModuleList([skip_proj, attn, ff]))
+             attn_residual = residual_klass(dim = dim, num_residual_streams = num_residual_streams, layer_index = next(counter))
+             ff_residual = residual_klass(dim = dim, num_residual_streams = num_residual_streams, layer_index = next(counter))
+
+             layers.append(ModuleList([skip_proj, attn, attn_residual, ff, ff_residual]))

        self.layers = layers
        self.norm = RMSNorm(dim)
@@ -1060,6 +1168,11 @@ def forward(
        cache = default(cache, (None,))
        iter_cache = iter(cache)

+         # expand input into multiple residual streams for maybe hyper connection
+
+         if self.num_residual_streams > 1:
+             x = repeat(x, 'b ... -> (b s) ...', s = self.num_residual_streams)
+
        # transformer layers as usual, using mask from above

        skips = []
@@ -1069,7 +1182,7 @@ def forward(

        depth = len(self.layers)

-         for ind, (skip_proj, attn, ff) in enumerate(self.layers):
+         for ind, (skip_proj, attn, attn_residual, ff, ff_residual) in enumerate(self.layers):
            layer = ind + 1

            # skip connection
@@ -1089,6 +1202,8 @@ def forward(

            # attention and feedforward

+             x, add_attn_residual = attn_residual.prepare_with_inverse(x)
+
            (attn_out, attn_values), kv_cache = attn(
                x,
                rotary_emb = rotary_emb,
@@ -1104,8 +1219,18 @@ def forward(

            new_cache.append(kv_cache)

-             x = attn_out + x
-             x = ff(x, **adaptive_kwargs) + x
+             x = add_attn_residual(attn_out)
+
+             x, add_ff_residual = ff_residual.prepare_with_inverse(x)
+
+             ff_out = ff(x, **adaptive_kwargs)
+
+             x = add_ff_residual(ff_out)
+
+         # reduce multiple residual streams for maybe hyper connection
+
+         if self.num_residual_streams > 1:
+             x = reduce(x, '(b s) ... -> b ...', 'sum', s = self.num_residual_streams)

        assert len(skips) == 0

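Outside the layers themselves, the change is only bookkeeping on the stream axis: the input is copied into `num_residual_streams` streams folded into the batch, and the streams are summed back out after the last layer. A standalone sketch of that round trip, with illustrative shapes:

import torch
from einops import repeat, reduce

s = 4                                              # number of residual streams (illustrative)
x = torch.randn(2, 16, 512)                        # (batch, seq, dim)

x = repeat(x, 'b ... -> (b s) ...', s = s)         # expand: copy each sample into s streams -> (8, 16, 512)
# ... the transformer layers above operate on the expanded tensor ...
x = reduce(x, '(b s) ... -> b ...', 'sum', s = s)  # collapse: sum the streams back -> (2, 16, 512)

assert x.shape == (2, 16, 512)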