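fix bugs: size causal mask from actual chunk shapes (qc, kc) instead of bucket sizes, and compute D in backward from per-chunk doc/oc instead of full do/o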

lucidrains · lucidrains · commit 7ce4d634a05a · 2022-07-23T12:49:33.000-07:00
diff --git a/memory_efficient_attention_pytorch/flash_attention.py b/memory_efficient_attention_pytorch/flash_attention.py
@@ -65,7 +65,7 @@ def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
                     attn_weights.masked_fill_(~row_mask, max_neg_value)
 
                 if causal and q_start_index < (k_start_index + k_bucket_size - 1):
-                    causal_mask = torch.ones((q_bucket_size, k_bucket_size), dtype = torch.bool, device = device).triu(q_start_index - k_start_index + 1)
+                    causal_mask = torch.ones((qc.shape[-2], kc.shape[-2]), dtype = torch.bool, device = device).triu(q_start_index - k_start_index + 1)
                     attn_weights.masked_fill_(causal_mask, max_neg_value)
 
                 block_row_maxes = attn_weights.amax(dim = -1, keepdims = True)
@@ -143,7 +143,7 @@ def backward(ctx, do):
                 attn_weights = einsum('... i d, ... j d -> ... i j', qc_scaled, kc)
 
                 if causal and q_start_index < (k_start_index + k_bucket_size - 1):
-                    causal_mask = torch.ones((q_bucket_size, k_bucket_size), dtype = torch.bool, device = device).triu(q_start_index - k_start_index + 1)
+                    causal_mask = torch.ones((qc.shape[-2], kc.shape[-2]), dtype = torch.bool, device = device).triu(q_start_index - k_start_index + 1)
                     attn_weights.masked_fill_(causal_mask, max_neg_value)
 
                 exp_attn_weights = torch.exp(attn_weights - mc)
@@ -156,7 +156,7 @@ def backward(ctx, do):
                 dv_chunk = einsum('... i j, ... i d -> ... j d', p, doc)
                 dp = einsum('... i d, ... j d -> ... i j', doc, vc)
 
-                D = (do * o).sum(dim = -1, keepdims = True)
+                D = (doc * oc).sum(dim = -1, keepdims = True)
                 ds = p * scale * (dp - D)
 
                 dq_chunk = einsum('... i j, ... j d -> ... i d', ds, kc)
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 setup(
   name = 'memory-efficient-attention-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.18',
+  version = '0.0.19',
   license='MIT',
   description = 'Memory Efficient Attention - Pytorch',
   long_description_content_type = 'text/markdown',