@@ -24,6 +24,23 @@ def cast_tuple(val, depth = 1):
 
 # classes
 
+# https://arxiv.org/abs/2103.17239
+class LayerScale(nn.Module):
+    def __init__(self, dim, depth, fn):
+        super().__init__()
+        if depth <= 18:
+            init_eps = 0.1
+        elif depth > 18 and depth <= 24:
+            init_eps = 1e-5
+        else:
+            init_eps = 1e-6
+
+        scale = torch.zeros(1, 1, dim).fill_(init_eps)
+        self.scale = nn.Parameter(scale)
+        self.fn = fn
+    def forward(self, x, **kwargs):
+        return self.fn(x, **kwargs) * self.scale
+
 class PreNorm(nn.Module):
     def __init__(self, dim, fn):
         super().__init__()
@@ -77,7 +94,7 @@ def __init__(
         attn_types = cast_tuple(attn_types)
         attn_type_layer = islice(cycle(attn_types), depth)
 
-        for _, sparse_attn, attn_type in zip(range(depth), sparse_layer, attn_type_layer):
+        for ind, sparse_attn, attn_type in zip(range(depth), sparse_layer, attn_type_layer):
             if attn_type == 'full':
                 attn_class = Attention
             elif attn_type == 'sparse':
@@ -92,8 +109,8 @@ def __init__(
                 raise ValueError(f'attention type "{attn_type}" is not valid')
 
             layers.append(nn.ModuleList([
-                PreNorm(dim, attn_class(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout)),
-                PreNorm(dim, FeedForward(dim, mult = ff_mult, dropout = ff_dropout))
+                LayerScale(dim, ind + 1, PreNorm(dim, attn_class(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout))),
+                LayerScale(dim, ind + 1, PreNorm(dim, FeedForward(dim, mult = ff_mult, dropout = ff_dropout)))
             ]))
 
         execute_type = ReversibleSequence if reversible else SequentialSequence
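For reference, here is a minimal standalone sketch (not part of the commit) of what the LayerScale wrapper added above does: it multiplies the output of the wrapped sub-layer by a small learned per-channel scale whose initial value shrinks with depth, per https://arxiv.org/abs/2103.17239. The dummy feed-forward sub-layer, tensor shapes, and depth value below are illustrative assumptions, not taken from the repository.

import torch
from torch import nn

# LayerScale as introduced in this commit
class LayerScale(nn.Module):
    def __init__(self, dim, depth, fn):
        super().__init__()
        # depth-dependent initial scale, following CaiT: 0.1 / 1e-5 / 1e-6
        if depth <= 18:
            init_eps = 0.1
        elif depth > 18 and depth <= 24:
            init_eps = 1e-5
        else:
            init_eps = 1e-6

        scale = torch.zeros(1, 1, dim).fill_(init_eps)
        self.scale = nn.Parameter(scale)
        self.fn = fn

    def forward(self, x, **kwargs):
        # scale the sub-layer output before it rejoins the residual stream
        return self.fn(x, **kwargs) * self.scale

# hypothetical usage with a dummy feed-forward sub-layer
dim = 512
ff = nn.Sequential(nn.Linear(dim, dim * 4), nn.GELU(), nn.Linear(dim * 4, dim))
block = LayerScale(dim, depth = 3, fn = ff)   # depth 3 -> init_eps = 0.1

x = torch.randn(1, 64, dim)                   # (batch, seq_len, dim)
out = x + block(x)                            # residual update starts close to identity
print(out.shape)                              # torch.Size([1, 64, 512])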