We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 35354fa, commit 7d92267 (Copy full SHA for 7d92267)
ch04/08_deltanet/README.md
@@ -166,7 +166,8 @@ class GatedDeltaNet(nn.Module):
166
# A_log + W_alpha(x) + dt_bias
167
self.W_alpha = nn.Linear(d_in, num_heads, bias=False)
168
self.dt_bias = nn.Parameter(torch.ones(num_heads))
169
- self.A_log = nn.Parameter(torch.zeros(num_heads))
+ A_init = torch.empty(num_heads).uniform_(0, 16)
170
+ self.A_log = nn.Parameter(torch.log(A_init))
171
# We could implement this as
172
# W_alpha = nn.Linear(d_in, num_heads, bias=True)
173
# but the bias is separate for interpretability and
0 commit comments