complete flash attention algorithm in plain pytorch (for educational purposes, performant version will be at https://github.com/HazyResearch/flash-attention)

lucidrains · lucidrains · commit 804a202ee04f · 2022-07-23T12:38:49.000-07:00
diff --git a/README.md b/README.md
@@ -52,9 +52,6 @@ mask = torch.ones(1, 65536).bool().cuda()
 out = cross_attn(x, context = context, mask = mask) # (1, 65536, 512)
 ```
 
-- [ ] benchmark and see how much torch jit helps
-- [ ] look at Triton and Keops and see if either can be a fit
-
 ## Citations
 
 ```bibtex
@@ -78,3 +75,13 @@ out = cross_attn(x, context = context, mask = mask) # (1, 65536, 512)
     primaryClass = {cs.CV}
 }
 ```
+
+```bibtex
+@article{Dao2022FlashAttentionFA,
+    title   = {FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness},
+    author  = {Tri Dao and Daniel Y. Fu and Stefano Ermon and Atri Rudra and Christopher R{\'e}},
+    journal = {ArXiv},
+    year    = {2022},
+    volume  = {abs/2205.14135}
+}
+```
diff --git a/memory_efficient_attention_pytorch/flash_attention.py b/memory_efficient_attention_pytorch/flash_attention.py
@@ -0,0 +1,226 @@
+import math
+import torch
+from functools import partial
+from torch import nn, einsum
+from torch.autograd.function import Function
+
+from einops import rearrange
+
+# constants
+
+EPSILON = 1e-6
+
+# helper functions
+
+def exists(val):
+    """Return True for any value other than None (falsy values such as 0 or '' still count)."""
+    return not (val is None)
+
+def default(val, d):
+    """Return `val` unless it is None, in which case return the fallback `d`."""
+    if exists(val):
+        return val
+    return d
+
+# flash attention forwards and backwards
+
+def _blocked_positions(col_mask, causal, q_start_index, k_start_index, q_len, k_len, device):
+    """Build the "do not attend" mask for one (query block, key block) tile.
+
+    Args:
+        col_mask: bool tensor of shape (batch, 1, 1, k_len) for this key block,
+            True where a key may be attended to, or None.
+        causal: whether causal masking is enabled.
+        q_start_index: absolute position of the first query of the tile.
+        k_start_index: absolute position of the first key of the tile.
+        q_len: number of queries in the tile.
+        k_len: number of keys in the tile.
+        device: device on which the causal mask is created.
+
+    Returns:
+        A bool tensor broadcastable to (..., q_len, k_len) that is True where
+        attention is NOT allowed, or None when nothing in the tile is masked.
+    """
+    blocked = ~col_mask if exists(col_mask) else None
+
+    # the causal mask is only needed when the tile reaches past the diagonal,
+    # i.e. when its last key comes after its first query
+    if causal and q_start_index < (k_start_index + k_len - 1):
+        # sized from the real chunk lengths so a ragged last chunk still works
+        causal_mask = torch.ones((q_len, k_len), dtype = torch.bool, device = device).triu(q_start_index - k_start_index + 1)
+        blocked = causal_mask if not exists(blocked) else (blocked | causal_mask)
+
+    return blocked
+
+class FlashAttentionFunction(Function):
+    """Tiled exact attention with a hand written backward pass.
+
+    The forward pass walks over (query bucket, key bucket) tiles and keeps a
+    running row max and running row sum so the softmax never has to be
+    materialised for the full sequence. The backward pass recomputes the
+    attention probabilities tile by tile from those saved statistics.
+    """
+
+    @staticmethod
+    @torch.no_grad()
+    def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
+        """Compute softmax(q @ k^T * dim_head ** -0.5) @ v in tiles.
+
+        Args:
+            q, k, v: tensors of shape (..., seq_len, dim_head). When `mask` is
+                given they must be (batch, heads, seq_len, dim_head).
+            mask: optional bool tensor of shape (batch, k_len); True marks the
+                keys that may be attended to.
+            causal: if True, a query may only attend to keys at or before its
+                own position. When there are more keys than queries the
+                queries are aligned with the end of the key sequence.
+            q_bucket_size: number of queries per tile.
+            k_bucket_size: number of keys per tile.
+
+        Returns:
+            The attention output, same shape and dtype as `q`.
+
+        Raises:
+            ValueError: if `mask` is given and is not 2-dimensional.
+        """
+        device = q.device
+        max_neg_value = -torch.finfo(q.dtype).max
+        qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
+
+        # running statistics are kept in at least float32; float64 inputs keep float64
+        stats_dtype = torch.promote_types(q.dtype, torch.float32)
+
+        o = torch.zeros_like(q)
+        all_row_sums = torch.zeros((*q.shape[:-1], 1), dtype = stats_dtype, device = device)
+        all_row_maxes = torch.full((*q.shape[:-1], 1), max_neg_value, dtype = stats_dtype, device = device)
+
+        # the scale is applied to the logits, so the unscaled q is what gets saved for backward
+        scale = q.shape[-1] ** -0.5
+
+        k_chunks = k.split(k_bucket_size, dim = -2)
+        v_chunks = v.split(k_bucket_size, dim = -2)
+
+        if exists(mask):
+            if mask.ndim != 2:
+                raise ValueError(f'mask must have shape (batch, k_len), got {tuple(mask.shape)}')
+
+            # key padding mask: one chunk per key bucket, broadcast over heads and queries
+            mask_chunks = mask[:, None, None, :].split(k_bucket_size, dim = -1)
+        else:
+            mask_chunks = (None,) * len(k_chunks)
+
+        col_splits = tuple(zip(k_chunks, v_chunks, mask_chunks))
+
+        row_splits = zip(
+            q.split(q_bucket_size, dim = -2),
+            o.split(q_bucket_size, dim = -2),
+            all_row_sums.split(q_bucket_size, dim = -2),
+            all_row_maxes.split(q_bucket_size, dim = -2),
+        )
+
+        for ind, (qc, oc, row_sums, row_maxes) in enumerate(row_splits):
+            # absolute position of the first query of this bucket
+            q_start_index = ind * q_bucket_size + qk_len_diff
+
+            for k_ind, (kc, vc, col_mask) in enumerate(col_splits):
+                k_start_index = k_ind * k_bucket_size
+
+                # every key of this tile lies in the future of every query, nothing to add
+                if causal and k_start_index > (q_start_index + qc.shape[-2] - 1):
+                    continue
+
+                attn_weights = einsum('... i d, ... j d -> ... i j', qc, kc) * scale
+
+                blocked = _blocked_positions(col_mask, causal, q_start_index, k_start_index, qc.shape[-2], kc.shape[-2], device)
+
+                if exists(blocked):
+                    attn_weights.masked_fill_(blocked, max_neg_value)
+
+                block_row_maxes = attn_weights.amax(dim = -1, keepdims = True)
+
+                attn_weights -= block_row_maxes
+                exp_weights = torch.exp(attn_weights)
+
+                # a fully masked row has exp(0) = 1 everywhere, so masked entries are zeroed explicitly
+                if exists(blocked):
+                    exp_weights.masked_fill_(blocked, 0.)
+
+                block_row_sums = exp_weights.sum(dim = -1, keepdims = True).clamp(min = EPSILON)
+
+                # running max is taken against the previous row maxes
+                new_row_maxes = torch.maximum(block_row_maxes, row_maxes)
+
+                exp_values = einsum('... i j, ... j d -> ... i d', exp_weights, vc)
+
+                # factors that re-base the old and the new statistics onto the new max
+                exp_row_max_diff = torch.exp(row_maxes - new_row_maxes)
+                exp_block_row_max_diff = torch.exp(block_row_maxes - new_row_maxes)
+
+                new_row_sums = exp_row_max_diff * row_sums + exp_block_row_max_diff * block_row_sums
+
+                out = (row_sums / new_row_sums) * exp_row_max_diff * oc + \
+                      (exp_block_row_max_diff / new_row_sums) * exp_values
+
+                # the chunks are views, so these writes land in o / all_row_maxes / all_row_sums
+                oc.copy_(out)
+                row_maxes.copy_(new_row_maxes)
+                row_sums.copy_(new_row_sums)
+
+        ctx.args = (causal, scale, mask_chunks, q_bucket_size, k_bucket_size)
+        ctx.save_for_backward(q, k, v, o, all_row_sums, all_row_maxes)
+
+        return o
+
+    @staticmethod
+    @torch.no_grad()
+    def backward(ctx, do):
+        """Compute the gradients of the attention output w.r.t. q, k and v.
+
+        Args:
+            do: gradient of the loss w.r.t. the forward output, same shape as `q`.
+
+        Returns:
+            (dq, dk, dv, None, None, None, None), one entry per forward input;
+            the non-tensor inputs and the mask receive no gradient.
+        """
+        causal, scale, mask_chunks, q_bucket_size, k_bucket_size = ctx.args
+        q, k, v, o, l, m = ctx.saved_tensors
+
+        device = q.device
+
+        max_neg_value = -torch.finfo(q.dtype).max
+        qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
+
+        dq = torch.zeros_like(q)
+        dk = torch.zeros_like(k)
+        dv = torch.zeros_like(v)
+
+        col_splits = tuple(zip(
+            k.split(k_bucket_size, dim = -2),
+            v.split(k_bucket_size, dim = -2),
+            dk.split(k_bucket_size, dim = -2),
+            dv.split(k_bucket_size, dim = -2),
+            mask_chunks,
+        ))
+
+        row_splits = zip(
+            q.split(q_bucket_size, dim = -2),
+            o.split(q_bucket_size, dim = -2),
+            do.split(q_bucket_size, dim = -2),
+            l.split(q_bucket_size, dim = -2),
+            m.split(q_bucket_size, dim = -2),
+            dq.split(q_bucket_size, dim = -2)
+        )
+
+        for ind, (qc, oc, doc, lc, mc, dqc) in enumerate(row_splits):
+            q_start_index = ind * q_bucket_size + qk_len_diff
+
+            # rowsum(dO * O) for this query bucket, identical for every key bucket
+            D = (doc * oc).sum(dim = -1, keepdims = True)
+
+            for k_ind, (kc, vc, dkc, dvc, col_mask) in enumerate(col_splits):
+                k_start_index = k_ind * k_bucket_size
+
+                # tile was skipped in forward, so it contributes no gradient
+                if causal and k_start_index > (q_start_index + qc.shape[-2] - 1):
+                    continue
+
+                # same logits as in forward: unscaled q, scale applied once
+                attn_weights = einsum('... i d, ... j d -> ... i j', qc, kc) * scale
+
+                blocked = _blocked_positions(col_mask, causal, q_start_index, k_start_index, qc.shape[-2], kc.shape[-2], device)
+
+                if exists(blocked):
+                    attn_weights.masked_fill_(blocked, max_neg_value)
+
+                exp_attn_weights = torch.exp(attn_weights - mc)
+
+                if exists(blocked):
+                    exp_attn_weights.masked_fill_(blocked, 0.)
+
+                # recomputed softmax probabilities; cast back so the einsums below see one dtype
+                p = (exp_attn_weights / lc).to(doc.dtype)
+
+                dv_chunk = einsum('... i j, ... i d -> ... j d', p, doc)
+                dp = einsum('... i d, ... j d -> ... i j', doc, vc)
+
+                # softmax backward, with the logit scale folded in
+                ds = p * scale * (dp - D)
+
+                dq_chunk = einsum('... i j, ... j d -> ... i d', ds, kc)
+                dk_chunk = einsum('... i j, ... i d -> ... j d', ds, qc)
+
+                # the chunks are views, so the gradients accumulate into dq / dk / dv
+                dqc.add_(dq_chunk)
+                dkc.add_(dk_chunk)
+                dvc.add_(dv_chunk)
+
+        return dq, dk, dv, None, None, None, None
+
+# main class
+
+# just flash attention in plain pytorch
+# it will be way slower than implementing it in CUDA
+# for tinkering and educational purposes
+
+class FlashAttention(nn.Module):
+    """Multi-head attention layer that runs its attention step through FlashAttentionFunction.
+
+    Queries are projected from `x`; keys and values are projected from
+    `context`, or from `x` itself when no context is given.
+    """
+
+    def __init__(
+        self,
+        *,
+        dim,
+        heads = 8,
+        dim_head = 64,
+        causal = False,
+        q_bucket_size = 512,
+        k_bucket_size = 1024
+    ):
+        super().__init__()
+
+        self.heads = heads
+        self.causal = causal
+
+        # width of all heads concatenated
+        inner_dim = heads * dim_head
+
+        # keys and values share a single projection and are split apart in forward
+        self.to_q = nn.Linear(dim, inner_dim, bias = False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)
+        self.to_out = nn.Linear(inner_dim, dim)
+
+        # default tile sizes, each can be overridden per call in forward
+        self.q_bucket_size = q_bucket_size
+        self.k_bucket_size = k_bucket_size
+
+    def forward(
+        self,
+        x,
+        context = None,
+        mask = None,
+        q_bucket_size = None,
+        k_bucket_size = None,
+    ):
+        """Attend from `x` to `context` (or to `x` itself).
+
+        `mask` is handed to FlashAttentionFunction unchanged. Bucket sizes
+        left as None fall back to the values given at construction.
+        """
+        source = default(context, x)
+
+        queries = self.to_q(x)
+        keys, values = self.to_kv(source).chunk(2, dim = -1)
+
+        # split the feature dimension into heads
+        num_heads = self.heads
+        queries, keys, values = (
+            rearrange(t, 'b n (h d) -> b h n d', h = num_heads)
+            for t in (queries, keys, values)
+        )
+
+        attended = FlashAttentionFunction.apply(
+            queries,
+            keys,
+            values,
+            mask,
+            self.causal,
+            default(q_bucket_size, self.q_bucket_size),
+            default(k_bucket_size, self.k_bucket_size),
+        )
+
+        # merge the heads back and project to the model dimension
+        merged = rearrange(attended, 'b h n d -> b n (h d)')
+        return self.to_out(merged)
diff --git a/setup.py b/setup.py
@@ -3,9 +3,10 @@
 setup(
   name = 'memory-efficient-attention-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.17',
+  version = '0.0.18',
   license='MIT',
   description = 'Memory Efficient Attention - Pytorch',
+  long_description_content_type = 'text/markdown',
   author = 'Phil Wang',
   author_email = 'lucidrains@gmail.com',
   url = 'https://github.com/lucidrains/memory-efficient-attention-pytorch',