
Commit 75469e8

add new trick from flash attention 2 that saves on division
1 parent 8f85587 commit 75469e8

3 files changed: +13 −3 lines changed

README.md

Lines changed: 9 additions & 1 deletion
@@ -2,7 +2,7 @@
 
 Implementation of a memory efficient multi-head attention as proposed in the paper, <a href="https://arxiv.org/abs/2112.05682">Self-attention Does Not Need O(n²) Memory</a>. In addition, the module will take care of masking, causal masking, as well as cross attention.
 
-This repository also contains a <a href="https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py">naive non-CUDA implementation</a> of the improvements made by <a href="https://tridao.me/">Tri Dao</a> with his <a href="https://github.com/HazyResearch/flash-attention">Flash Attention</a> paper, for educational purposes. It is a game changer for attention and building long-context transformers.
+This repository also contains a <a href="https://github.com/lucidrains/memory-efficient-attention-pytorch/blob/main/memory_efficient_attention_pytorch/flash_attention.py">naive non-CUDA implementation</a> of the improvements made by <a href="https://tridao.me/">Tri Dao</a> with his <a href="https://github.com/HazyResearch/flash-attention">Flash Attention 2</a> paper, for educational purposes. It is a game changer for attention and building long-context transformers.
 
 Update: from now on, you should just be using the <a href="https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html?highlight=scaled_dot_product#torch.nn.functional.scaled_dot_product_attention">`F.scaled_dot_product_attention`</a> function in Pytorch 2.0
 
@@ -89,3 +89,11 @@ out = cross_attn(x, context = context, mask = mask) # (1, 65536, 512)
 volume = {abs/2205.14135}
 }
 ```
+
+```bibtex
+@article{dao2023flashattention2,
+    title = {Flash{A}ttention-2: Faster Attention with Better Parallelism and Work Partitioning},
+    author = {Dao, Tri},
+    year = {2023}
+}
+```
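
As context for the README's pointer above, here is a minimal sketch (not part of this commit; shapes and arguments are illustrative) of calling PyTorch 2.0's built-in `F.scaled_dot_product_attention`, which dispatches to a fused memory-efficient or flash kernel when one is available for the inputs:

```python
# sketch only - not part of this commit
import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 1024, 64)  # (batch, heads, seq_len, dim_head)
k = torch.randn(1, 8, 1024, 64)
v = torch.randn(1, 8, 1024, 64)

# is_causal = True applies a causal mask; an explicit attn_mask can be passed instead
out = F.scaled_dot_product_attention(q, k, v, is_causal = True)

print(out.shape)  # torch.Size([1, 8, 1024, 64])
```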

memory_efficient_attention_pytorch/flash_attention.py

Lines changed: 3 additions & 1 deletion
@@ -98,11 +98,13 @@ def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
 
 new_row_sums = exp_row_max_diff * row_sums + exp_block_row_max_diff * block_row_sums
 
-oc.mul_((row_sums / new_row_sums) * exp_row_max_diff).add_((exp_block_row_max_diff / new_row_sums) * exp_values)
+oc.mul_(exp_row_max_diff).add_(exp_block_row_max_diff * exp_values)
 
 row_maxes.copy_(new_row_maxes)
 row_sums.copy_(new_row_sums)
 
+oc.div_(row_sums)
+
 lse = all_row_sums.log() + all_row_maxes
 
 ctx.args = (causal, scale, mask, q_bucket_size, k_bucket_size)
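
The diff above is the division-saving trick from Flash Attention 2 named in the commit message: the output accumulator `oc` is kept unnormalized while iterating over key/value blocks (only rescaled by `exp_row_max_diff`), and the division by the softmax row sums happens once per query bucket via `oc.div_(row_sums)` instead of once per key block. Below is a self-contained sketch of the same idea, independent of this repo's actual class and tensor layout (function name, shapes, and block size are illustrative):

```python
# sketch only - illustrates the deferred-normalization trick, not the repo's exact code
import torch

def streaming_attention(q, k, v, block_size = 128):
    scale = q.shape[-1] ** -0.5

    out       = torch.zeros_like(q)                             # unnormalized output accumulator
    row_sums  = torch.zeros(*q.shape[:-1], 1)                   # running softmax denominators
    row_maxes = torch.full((*q.shape[:-1], 1), float('-inf'))   # running row maxima

    for kc, vc in zip(k.split(block_size, dim = -2), v.split(block_size, dim = -2)):
        attn = (q @ kc.transpose(-1, -2)) * scale

        block_maxes = attn.amax(dim = -1, keepdim = True)
        new_maxes   = torch.maximum(row_maxes, block_maxes)

        exp_attn = torch.exp(attn - new_maxes)
        exp_diff = torch.exp(row_maxes - new_maxes)              # rescales contributions of previous blocks

        row_sums  = exp_diff * row_sums + exp_attn.sum(dim = -1, keepdim = True)
        out       = exp_diff * out + exp_attn @ vc               # note: no division per block
        row_maxes = new_maxes

    return out / row_sums                                        # single division at the end

q, k, v = (torch.randn(2, 1024, 64) for _ in range(3))
ref = torch.softmax((q @ k.transpose(-1, -2)) * 64 ** -0.5, dim = -1) @ v
assert torch.allclose(streaming_attention(q, k, v), ref, atol = 1e-4)
```

Deferring the division gives the same result as renormalizing every block, since the accumulator and the row sums are rescaled by the same `exp_diff` factor at each step; the final quotient is unchanged, but the per-block work drops.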

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
   name = 'memory-efficient-attention-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.1.4',
+  version = '0.1.5',
   license='MIT',
   description = 'Memory Efficient Attention - Pytorch',
   long_description_content_type = 'text/markdown',
