
Commit 9db7dc8

chunk feedforward for entirely memory efficient transformer

1 parent df75876

3 files changed (+26, -12 lines)

memory_efficient_attention_pytorch/transformer.py

Lines changed: 20 additions & 7 deletions

@@ -19,12 +19,24 @@ def forward(self, x, **kwargs):
         x = self.norm(x)
         return self.fn(x, **kwargs)
 
-def FeedForward(dim, mult = 4):
-    return nn.Sequential(
-        nn.Linear(dim, dim * mult),
-        nn.GELU(),
-        nn.Linear(dim * mult, dim)
-    )
+class FeedForward(nn.Module):
+    def __init__(self, dim, mult = 4, chunks = 1):
+        super().__init__()
+        self.chunks = chunks
+
+        self.net = nn.Sequential(
+            nn.Linear(dim, dim * mult),
+            nn.GELU(),
+            nn.Linear(dim * mult, dim)
+        )
+
+    def forward(self, x):
+        if self.chunks <= 1:
+            return self.net(x)
+
+        chunks = x.chunk(self.chunks, dim = 1)
+        out = [self.net(chunk) for chunk in chunks]
+        return torch.cat(out, dim = 1)
 
 class Transformer(nn.Module):
     def __init__(
@@ -38,6 +50,7 @@ def __init__(
         dim_head = 64,
         heads = 8,
         ff_mult = 4,
+        ff_chunks = 1,
         **kwargs
     ):
         super().__init__()
@@ -50,7 +63,7 @@
         for _ in range(depth):
             self.layers.append(nn.ModuleList([
                 PreNorm(dim, Attention(dim = dim, dim_head = dim_head, heads = heads, causal = causal, **kwargs)),
-                PreNorm(dim, FeedForward(dim = dim, mult = ff_mult)),
+                PreNorm(dim, FeedForward(dim = dim, mult = ff_mult, chunks = ff_chunks)),
             ]))
 
         self.net = ReversibleSequence(self.layers)
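Since the feedforward is applied position-wise, splitting the sequence into chunks and running them through the network one at a time gives exactly the same output; only the peak size of the dim * mult intermediate activation shrinks, roughly by the chunk count. A minimal sketch of that equivalence, using a standalone copy of the class above so no import path is assumed:

import torch
from torch import nn

class ChunkedFeedForward(nn.Module):
    # standalone copy of the FeedForward in the diff above, for illustration only
    def __init__(self, dim, mult = 4, chunks = 1):
        super().__init__()
        self.chunks = chunks
        self.net = nn.Sequential(
            nn.Linear(dim, dim * mult),
            nn.GELU(),
            nn.Linear(dim * mult, dim)
        )

    def forward(self, x):
        if self.chunks <= 1:
            return self.net(x)
        # the network is position-wise, so splitting along the sequence
        # dimension and concatenating the outputs is exact
        chunks = x.chunk(self.chunks, dim = 1)
        return torch.cat([self.net(c) for c in chunks], dim = 1)

ff = ChunkedFeedForward(dim = 512, chunks = 5).eval()
x = torch.randn(1, 4096, 512)

with torch.no_grad():
    # chunked and unchunked paths agree
    assert torch.allclose(ff(x), ff.net(x), atol = 1e-6)

Paired with the ReversibleSequence already used here, which recomputes layer activations during the backward pass instead of storing them, this is presumably what the commit message means by an "entirely memory efficient" transformer.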

setup.py

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 setup(
   name = 'memory-efficient-attention-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.14',
+  version = '0.0.15',
   license='MIT',
   description = 'Memory Efficient Attention - Pytorch',
   author = 'Phil Wang',

train.py

Lines changed: 5 additions & 4 deletions

@@ -18,7 +18,7 @@
 LEARNING_RATE = 2e-4
 VALIDATE_EVERY = 100
 GENERATE_EVERY = 500
-GENERATE_LENGTH = 2048
+GENERATE_LENGTH = 4096
 SEQ_LEN = 4096
 
 # helpers
@@ -40,12 +40,13 @@ def decode_tokens(tokens):
     num_tokens = 256,
     dim = 512,
     max_seq_len = SEQ_LEN,
-    depth = 8,
+    depth = 6,
     heads = 8,
     causal = True,
     memory_efficient = True,
-    q_bucket_size = 512,
-    k_bucket_size = 512
+    q_bucket_size = 256,
+    k_bucket_size = 256,
+    ff_chunks = 5
 )
 
 model = AutoregressiveWrapper(model)
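One detail worth noting: ff_chunks = 5 does not divide SEQ_LEN = 4096 evenly. That is fine, since torch.chunk simply makes the last chunk smaller rather than requiring equal splits or padding:

import torch

x = torch.randn(1, 4096, 512)
print([c.shape[1] for c in x.chunk(5, dim = 1)])
# prints [820, 820, 820, 820, 816]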
