@@ -30,7 +30,6 @@ def attention(
     if exists(attn_bias):
         sim = sim + attn_bias
 
-    sim = sim - sim.amax(dim = -1, keepdim = True).detach()
     mask_value = -torch.finfo(sim.dtype).max
 
     if exists(mask):
@@ -42,6 +41,7 @@ def attention(
         mask = torch.ones(i, j, device = q.device, dtype = torch.bool).triu(j - i + 1)
         sim = sim.masked_fill(mask, mask_value)
 
+    sim = sim - sim.amax(dim = -1, keepdim = True).detach()
     attn = sim.softmax(dim = -1)
 
     out = einsum('b h i j, b h j d -> b h i d', attn, v)
@@ -55,9 +55,6 @@ def summarize_qkv_chunk(q, k, v, mask, causal_mask, attn_bias_chunk):
     if exists(attn_bias_chunk):
         weight = weight + attn_bias_chunk
 
-    weight_max = weight.amax(dim = -1, keepdim = True).detach()
-    weight = weight - weight_max
-
     mask_value = -torch.finfo(weight.dtype).max
 
     if exists(mask):
@@ -67,6 +64,9 @@ def summarize_qkv_chunk(q, k, v, mask, causal_mask, attn_bias_chunk):
     if exists(causal_mask):
         weight = weight.masked_fill(causal_mask, mask_value)
 
+    weight_max = weight.amax(dim = -1, keepdim = True).detach()
+    weight = weight - weight_max
+
     exp_weight = weight.exp()
     weighted_value = einsum('b h i j, b h j d -> b h i d', exp_weight, v)
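The diff moves the row-wise max subtraction (the usual numerically stable softmax trick) so it runs after the attention bias and the masks have been applied, in both the plain attention path and the chunked summarize_qkv_chunk path. Below is a minimal sketch of that pattern only, not code from this commit; the helper name stable_masked_softmax and the keep_mask argument are hypothetical and introduced here purely for illustration.

import torch

def stable_masked_softmax(sim, keep_mask = None):
    # hypothetical helper, for illustration only
    mask_value = -torch.finfo(sim.dtype).max

    # apply masking first, so the stabilizing max is taken over the
    # scores that actually participate in the softmax
    if keep_mask is not None:
        sim = sim.masked_fill(~keep_mask, mask_value)

    # subtract the detached row max: exp() then sees values <= 0,
    # which cannot overflow, and gradients are unaffected by the shift
    sim = sim - sim.amax(dim = -1, keepdim = True).detach()
    return sim.softmax(dim = -1)

# usage sketch: large logits that would overflow a naive exp()
sim = torch.randn(2, 4, 8, 8) * 1000.
keep_mask = torch.ones(8, 8, dtype = torch.bool).tril()  # causal-style mask, True = keep
attn = stable_masked_softmax(sim, keep_mask)
assert torch.isfinite(attn).all()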