@@ -54,13 +54,15 @@ def forward(self, x):
         return self.net(x)
 
 class Attention(nn.Module):
-    def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropout = 0.):
+    def __init__(self, dim, seq_len, causal = True, heads = 8, dim_head = 64, dropout = 0., noncausal_attn_len = 0):
         super().__init__()
         inner_dim = dim_head * heads
         self.heads = heads
         self.seq_len = seq_len
         self.scale = dim ** -0.5
+
         self.causal = causal
+        self.noncausal_attn_len = noncausal_attn_len
 
         self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
         self.to_out = nn.Sequential(
@@ -84,6 +86,11 @@ def forward(self, x, mask = None):
         if self.causal:
             i, j = dots.shape[-2:]
             mask = torch.ones(i, j, device = device).triu_(j - i + 1).bool()
+
+            if self.noncausal_attn_len > 0:
+                ind = slice(0, self.noncausal_attn_len)
+                mask[ind, ind] = False
+
             dots.masked_fill_(mask, mask_value)
 
         attn = dots.softmax(dim = -1)
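
For intuition, here is a minimal standalone sketch (not part of the commit) of what the boolean mask built in the hunk above looks like once the non-causal prefix is carved out. The toy sizes `i = j = 5` and `noncausal_attn_len = 2` are assumptions for illustration; `True` marks positions that get filled with `mask_value`:

```python
import torch

# Toy reproduction of the masking logic above: the strict upper triangle is
# masked (future positions), then the top-left prefix block is un-masked so
# the first `noncausal_attn_len` tokens attend to each other bidirectionally.
i = j = 5
noncausal_attn_len = 2

mask = torch.ones(i, j).triu_(j - i + 1).bool()

ind = slice(0, noncausal_attn_len)
mask[ind, ind] = False

print(mask.int())
# tensor([[0, 0, 1, 1, 1],
#         [0, 0, 1, 1, 1],
#         [0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0]])
```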
@@ -146,6 +153,10 @@ def forward(self, x, mask = None):
             mask_value = -(torch.finfo(q.dtype).max / 2)
             attn_mask.masked_fill_(mask, mask_value)
 
+        if self.noncausal_attn_len:
+            ind = slice(0, self.noncausal_attn_len)
+            attn_mask[ind, ind] = 0.
+
         out = self.attn_fn(q, k, v, attn_mask = attn_mask, key_padding_mask = key_pad_mask)
         out = rearrange(out, 'b h n d -> b n (h d)')
         out = self.to_out(out)
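
The SparseAttention path uses an additive float mask rather than a boolean one, so the equivalent of "un-masking" is writing zeros back into the prefix block. A small sketch of that logic under the same toy sizes, again not part of the commit:

```python
import torch

# Additive-mask variant: disallowed positions hold a large negative value,
# allowed positions hold 0. Writing 0. into the top-left block re-enables
# bidirectional attention among the first `noncausal_attn_len` tokens.
n, noncausal_attn_len = 5, 2

bool_mask = torch.ones(n, n).triu_(1).bool()
mask_value = -(torch.finfo(torch.float32).max / 2)

attn_mask = torch.zeros(n, n)
attn_mask.masked_fill_(bool_mask, mask_value)

ind = slice(0, noncausal_attn_len)
attn_mask[ind, ind] = 0.
```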
@@ -165,6 +176,7 @@ def __init__(
         ff_mult = 4,
         attn_dropout = 0.,
         ff_dropout = 0.,
+        noncausal_attn_len = 0,
         sparse_attn = True,
         sparse_attn_global_indices = []
     ):
@@ -176,7 +188,7 @@ def __init__(
             attn_class = Attention if not sparse_attn else partial(SparseAttention, sparse_attn_global_indices = sparse_attn_global_indices)
 
             layers.append(nn.ModuleList([
-                PreNorm(dim, attn_class(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout)),
+                PreNorm(dim, attn_class(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout, noncausal_attn_len = noncausal_attn_len)),
                 PreNorm(dim, FeedForward(dim, mult = ff_mult, dropout = ff_dropout))
             ]))
 
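Downstream, the new keyword just threads through the `Transformer` constructor into each attention layer. A hypothetical call for illustration; only the keyword arguments visible in this diff are confirmed, and the remaining arguments (e.g. `depth`) are assumptions:

```python
# Hypothetical usage sketch -- argument names other than those appearing in
# the diff (dim, seq_len, causal, heads, dim_head, noncausal_attn_len, ...)
# are assumptions, not confirmed by the commit.
model = Transformer(
    dim = 512,
    depth = 6,                  # assumed argument
    seq_len = 1024,
    causal = True,
    noncausal_attn_len = 256,   # first 256 tokens attend bidirectionally among themselves
    sparse_attn = False,
)
```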