@@ -60,7 +60,7 @@ class LinearAttentionHead(nn.Module):
     """
     Linear attention, as proposed by the linformer paper
     """
-    def __init__(self, dim, dropout, E_proj, F_proj):
+    def __init__(self, dim, dropout, E_proj, F_proj, full_attention=False):
         super(LinearAttentionHead, self).__init__()
         self.w_k = nn.Linear(dim, dim)
         self.w_q = nn.Linear(dim, dim)
@@ -70,6 +70,7 @@ def __init__(self, dim, dropout, E_proj, F_proj):
         self.dim = dim
         self.dropout = nn.Dropout(dropout)
         self.P_bar = None
+        self.full_attention = full_attention
 
     def forward(self, Q, K, V, **kwargs):
         """
@@ -78,23 +79,27 @@ def forward(self, Q, K, V, **kwargs):
         """
         KW = self.w_k(K)
         KW = torch.transpose(KW, 1, 2)
-        KW = self.E(KW)
+        if not self.full_attention:
+            KW = self.E(KW)
         QW = self.w_q(Q)
         QW = torch.matmul(QW, KW)
 
         P_bar = QW / torch.sqrt(torch.tensor(self.dim).type(Q.type()))
         P_bar = P_bar.softmax(dim=-1)
 
+        print(P_bar.shape)
         # Only save this when visualizing
         if "visualize" in kwargs and kwargs["visualize"] == True:
             self.P_bar = P_bar
 
         P_bar = self.dropout(P_bar)
 
         VW = self.w_v(V)
-        VW = torch.transpose(VW, 1, 2)
-        VW = self.F(VW)
-        VW = torch.transpose(VW, 1, 2)
+
+        if not self.full_attention:
+            VW = torch.transpose(VW, 1, 2)
+            VW = self.F(VW)
+            VW = torch.transpose(VW, 1, 2)
         out_tensor = torch.matmul(P_bar, VW)
 
         return out_tensor
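For context, here is a minimal, self-contained sketch of the shape flow the two branches of `forward()` produce. It omits the `w_q`/`w_k` projections, scaling, and softmax, and the sizes `n`, `d`, `k` plus the standalone `nn.Linear` standing in for the `E` projection are illustrative assumptions, not values from this commit.

```python
import torch
import torch.nn as nn

# Toy sizes, chosen only for illustration: sequence length n, head dim d, projected dim k.
n, d, k = 512, 64, 128
Q = torch.randn(1, n, d)
K = torch.randn(1, n, d)
E = nn.Linear(n, k)  # stands in for the E projection returned by get_EF

# Linformer branch (full_attention=False): the sequence axis of K is compressed to k.
KW = torch.transpose(K, 1, 2)   # (1, d, n)
KW = E(KW)                      # (1, d, k)
P_bar = torch.matmul(Q, KW)     # (1, n, k) -- linear in n

# Full-attention branch (full_attention=True): no E projection, quadratic in n.
P_full = torch.matmul(Q, torch.transpose(K, 1, 2))  # (1, n, n)

print(P_bar.shape, P_full.shape)
```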
@@ -104,7 +109,7 @@ class MHAttention(nn.Module):
     Multihead attention, with each head being a Linformer Head
     This feeds directly into a feed forward head
     """
-    def __init__(self, input_size, dim, channels, dim_k, nhead, dropout, activation, checkpoint_level, parameter_sharing, E_proj, F_proj):
+    def __init__(self, input_size, dim, channels, dim_k, nhead, dropout, activation, checkpoint_level, parameter_sharing, E_proj, F_proj, full_attention):
         super(MHAttention, self).__init__()
         self.heads = nn.ModuleList()
         self.input_size = input_size
@@ -118,7 +123,7 @@ def __init__(self, input_size, dim, channels, dim_k, nhead, dropout, activation,
             if parameter_sharing == "none":
                 E_proj = get_EF(input_size, dim_k)
                 F_proj = get_EF(input_size, dim_k)
-            attn = LinearAttentionHead(dim, dropout, E_proj, F_proj)
+            attn = LinearAttentionHead(dim, dropout, E_proj, F_proj, full_attention)
             self.heads.append(attn)
         self.w_o = nn.Linear(dim*nhead, channels)
         self.to_q = nn.Linear(channels, dim, bias=False)
@@ -147,7 +152,7 @@ class Linformer(nn.Module):
     My attempt at reproducing the Linformer Paper
     https://arxiv.org/pdf/2006.04768.pdf
     """
-    def __init__(self, input_size=8192, channels=128, dim_k=64, dim_ff=256, dim_d=None, dropout_ff=0.15, nhead=4, depth=1, dropout=0.1, activation="gelu", use_pos_emb=True, checkpoint_level="C0", parameter_sharing="layerwise", k_reduce_by_layer=0):
+    def __init__(self, input_size=8192, channels=128, dim_k=64, dim_ff=256, dim_d=None, dropout_ff=0.15, nhead=4, depth=1, dropout=0.1, activation="gelu", use_pos_emb=True, checkpoint_level="C0", parameter_sharing="layerwise", k_reduce_by_layer=0, full_attention=False):
         super(Linformer, self).__init__()
         assert activation == "gelu" or activation == "relu", "Only gelu and relu activations supported for now"
         assert checkpoint_level == "C0" or checkpoint_level == "C1" or checkpoint_level == "C2", "Checkpoint level has to be either C0, C1, or C2."
@@ -167,7 +172,7 @@ def __init__(self, input_size=8192, channels=128, dim_k=64, dim_ff=256, dim_d=No
             self.E = get_EF(input_size, dim_k)
             self.F = self.E
 
-        get_attn = lambda curr_dim_k: MHAttention(input_size, head_dim, channels, curr_dim_k, nhead, dropout, activation, checkpoint_level, parameter_sharing, self.E, self.F)
+        get_attn = lambda curr_dim_k: MHAttention(input_size, head_dim, channels, curr_dim_k, nhead, dropout, activation, checkpoint_level, parameter_sharing, self.E, self.F, full_attention)
         get_ff = lambda: FeedForward(channels, dim_ff, dropout_ff)
         norm_attn = lambda: nn.LayerNorm(channels)
         norm_ff = lambda: nn.LayerNorm(channels)
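A quick way to exercise the new flag end to end is a sketch like the one below. The `linformer_pytorch` import path and the `(batch, input_size, channels)` input layout are assumptions about the surrounding repo, not something shown in this diff; the keyword names come from the constructor above.

```python
import torch
from linformer_pytorch import Linformer  # import path assumed, not shown in this diff

# Small, hypothetical sizes for a smoke test.
kwargs = dict(input_size=512, channels=64, dim_k=128, dim_ff=128, nhead=4, depth=1)
linear_model = Linformer(**kwargs)                      # default: Linformer E/F projections
full_model = Linformer(**kwargs, full_attention=True)   # new flag: plain softmax attention

x = torch.randn(1, 512, 64)  # assumed (batch, input_size, channels) layout
print(linear_model(x).shape, full_model(x).shape)
```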