Doc corrections and unicode(theta) #184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 4 commits into base: master
41 changes: 13 additions & 28 deletions src/activation.jl
@@ -23,10 +23,9 @@ end
Segment-wise linear approximation of sigmoid
See: [BinaryConnect: Training Deep Neural Networks with binary weights during propagations](https://arxiv.org/pdf/1511.00363.pdf)
"""
hardσ(x::Real, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
hardσ(x::Real, a=0.2) = oftype(x / 1, max(zero(x / 1), min(one(x / 1), oftype(x / 1, a) * x + oftype(x / 1, 0.5))))
const hardsigmoid = hardσ
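For reference, a few hand-checked values of this clamp, assuming the definition in this hunk is in scope (it computes `a*x + 0.5` clamped to `[0, 1]`, with `a = 0.2` by default):

```julia
using NNlib          # assumes hardσ is reachable from the package
hardσ(0.0)    # 0.5, since 0.2*0 + 0.5 = 0.5
hardσ(3.0)    # 1.0, since 0.2*3 + 0.5 = 1.1 is clamped from above
hardσ(-3.0)   # 0.0, since 0.2*(-3) + 0.5 = -0.1 is clamped from below
hardσ(1)      # 0.7, returned as Float64 even for an Int input, thanks to oftype(x / 1, ...)
```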


"""
logσ(x)

@@ -43,16 +42,14 @@ Return `log(σ(x))` which is computed in a numerically stable way.
logσ(x::Real) = -softplus(-x)
const logsigmoid = logσ
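A quick sketch of why the docstring stresses numerical stability, assuming `σ` and `logσ` as defined in this file:

```julia
using NNlib
log(σ(-800.0))   # -Inf: σ(-800.0) underflows to 0.0 and log(0.0) is -Inf
logσ(-800.0)     # -800.0: -softplus(800.0) never forms the underflowing intermediate
```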


"""
hardtanh(x) = max(-1, min(1, x))

Segment-wise linear approximation of tanh. Cheaper and more computationally efficient version of tanh.
Segment-wise linear approximation of tanh. Cheaper and more computationally efficient version of tanh
See: (http://ronan.collobert.org/pub/matos/2004_phdthesis_lip6.pdf)
"""
hardtanh(x::Real) = max(-one(x), min( one(x), x))


"""
relu(x) = max(0, x)

@@ -61,15 +58,14 @@ activation function.
"""
relu(x::Real) = max(zero(x), x)


"""
leakyrelu(x, a=0.01) = max(a*x, x)

Leaky [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
activation function.
You can also specify the coefficient explicitly, e.g. `leakyrelu(x, 0.01)`.
"""
leakyrelu(x::Real, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
leakyrelu(x::Real, a=0.01) = max(oftype(x / 1, a) * x, x / 1)
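A sketch of what the `oftype(x / 1, a)` and `x / 1` pattern in the new definition buys: integer inputs are promoted to floating point, and the slope is converted to the input's float type, so `Float32` stays `Float32`.

```julia
using NNlib
leakyrelu(-2)         # -0.02   :: Float64, the Int input is promoted by x / 1
leakyrelu(-2.0f0)     # -0.02f0 :: Float32, the 0.01 slope is converted by oftype instead of promoting x
leakyrelu(3.0, 0.2)   # 3.0, positive inputs pass through unchanged
```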

"""
relu6(x) = min(max(0, x), 6)
@@ -102,8 +98,7 @@ Exponential Linear Unit activation function.
See [Fast and Accurate Deep Network Learning by Exponential Linear Units](https://arxiv.org/abs/1511.07289).
You can also specify the coefficient explicitly, e.g. `elu(x, 1)`.
"""
elu(x::Real, α = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))

elu(x::Real, α = one(x)) = ifelse(x ≥ 0, x / 1, α * (exp(x) - one(x)))

"""
gelu(x) = 0.5x * (1 + tanh(√(2/π) * (x + 0.044715x^3)))
@@ -119,7 +114,6 @@ function gelu(x::Real)
h * x * (one(x) + tanh(λ * (x + α * x^3)))
end


"""
swish(x) = x * σ(x)

@@ -128,16 +122,14 @@ See [Swish: a Self-Gated Activation Function](https://arxiv.org/pdf/1710.05941.p
"""
swish(x::Real) = x * σ(x)


"""
lisht(x) = x * tanh(x)

Non-Parametric Linearly Scaled Hyperbolic Tangent Activation Function
Non-Parametric Linearly Scaled Hyperbolic Tangent Activation Function.
See [LiSHT](https://arxiv.org/abs/1901.05894)
"""
lisht(x::Real) = x * tanh(x)


"""
selu(x) = λ * (x ≥ 0 ? x : α * (exp(x) - 1))

@@ -150,57 +142,50 @@ See [Self-Normalizing Neural Networks](https://arxiv.org/pdf/1706.02515.pdf).
function selu(x::Real)
λ = oftype(x / 1, 1.0507009873554804934193349852946)
α = oftype(x / 1, 1.6732632423543772848170429916717)
λ * ifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
λ * ifelse(x > 0, x / 1, α * (exp(x) - one(x)))
end
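A quick empirical check of the self-normalizing property that these λ and α constants are chosen for; this is only a sketch, not part of the diff, and assumes `selu` as defined above:

```julia
using NNlib, Statistics
x = randn(10^6)      # activations with mean 0 and variance 1
y = selu.(x)
mean(y), std(y)      # both stay close to (0, 1), the fixed point SELU is designed around
```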

"""
celu(x, α=1) =
(x ≥ 0 ? x : α * (exp(x/α) - 1))

Continuously Differentiable Exponential Linear Units
See [Continuously Differentiable Exponential Linear Units](https://arxiv.org/pdf/1704.07483.pdf).
"""
celu(x::Real, α::Real = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))

celu(x::Real, α::Real = one(x)) = ifelse(x ≥ 0, x / 1, α * (exp(x/α) - one(x)))

"""
trelu(x, theta = 1.0) = x > theta ? x : 0
trelu(x, θ=1.0) = x > θ ? x : 0

Threshold Gated Rectified Linear
See [ThresholdRelu](https://arxiv.org/pdf/1402.3337.pdf)
See [Threshold Gated Rectified Linear Unit](https://arxiv.org/pdf/1402.3337.pdf)
"""
trelu(x::Real,theta = one(x)) = ifelse(x> theta, x, zero(x))
trelu(x::Real,θ = one(x)) = ifelse(x> θ, x, zero(x))
Member suggested change:
trelu(x::Real,θ = one(x)) = ifelse(x> θ, x, zero(x))
trelu(x::Real, θ=one(x)) = ifelse(x> θ, x, zero(x))

const thresholdrelu = trelu
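Behaviour of the thresholded ReLU with the default `θ = one(x)`, sketched against the definition above:

```julia
using NNlib
trelu(2.0)        # 2.0: above the threshold the input passes through
trelu(0.5)        # 0.0: at or below θ everything is zeroed
trelu(2.0, 1.5)   # 2.0: θ can also be passed explicitly as the second argument
thresholdrelu === trelu   # true, the ASCII alias binds to the same function
```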


"""
softsign(x) = x / (1 + |x|)

See [Quadratic Polynomials Learn Better Image Features](http://www.iro.umontreal.ca/~lisa/publications2/index.php/attachments/single/205).
"""
softsign(x::Real) = x / (one(x) + abs(x))


"""
softplus(x) = log(exp(x) + 1)

See [Deep Sparse Rectifier Neural Networks](http://proceedings.mlr.press/v15/glorot11a/glorot11a.pdf).
"""
softplus(x::Real) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))


"""
logcosh(x)
logcosh(x) = x + softplus(-2x) - log(2)
Member: the mathematical definition fits better further down in the docstring.

Contributor (author): So this should be changed for all the existing functions? In all the other functions, the mathematical definition is written beside the function signature.

Return `log(cosh(x))` which is computed in a numerically stable way.
"""
logcosh(x::Real) = x + softplus(-2x) - log(oftype(x, 2))
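The reformulated identity avoids overflowing `cosh` for large `|x|`; a sketch assuming `logcosh` and `softplus` as defined in this file:

```julia
using NNlib
log(cosh(1000.0))   # Inf, because cosh(1000.0) overflows Float64
logcosh(1000.0)     # ≈ 999.3069, i.e. 1000 - log(2), computed without overflow
```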


"""
mish(x) = x * tanh(softplus(x))

Self Regularized Non-Monotonic Neural Activation Function
Self Regularized Non-Monotonic Neural Activation Function.
See [Mish: A Self Regularized Non-Monotonic Neural Activation Function](https://arxiv.org/abs/1908.08681).
"""
mish(x::Real) = x * tanh(softplus(x))
@@ -218,7 +203,7 @@ tanhshrink(x::Real) = x - tanh(x)

See [Softshrink Activation Function](https://www.gabormelli.com/RKB/Softshrink_Activation_Function)
"""
softshrink(x::Real, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ)
softshrink(x::Real, λ = oftype(x / 1, 0.5)) = min(max(zero(x), x - λ), x + λ)

# Provide an informative error message if activation functions are called with an array
for f in (:σ, :σ_stable, :hardσ, :logσ, :hardtanh, :relu, :leakyrelu, :relu6, :rrelu, :elu, :gelu, :swish, :lisht, :selu, :celu, :trelu, :softsign, :softplus, :logcosh, :mish, :tanhshrink, :softshrink)
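The body of this loop is truncated in the diff view. As a rough illustration only, not the generated code itself, the intent is that scalar activations get broadcast over arrays rather than called on them directly:

```julia
using NNlib
x = randn(Float32, 4)
σ.(x)    # broadcast the scalar activation elementwise, the supported usage
# σ(x)   # per the comment above, calling the scalar form on an array raises an
#        # informative error instead of silently doing something surprising
```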
6 changes: 2 additions & 4 deletions src/conv.jl
@@ -29,7 +29,7 @@ export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter, ∇conv_filter!


# First, we will define mappings from the generic API names to our accelerated backend
# implementations. For homogeneous-datatype 1, 2 and 3d convolutions, we default to using
# implementations. For homogeneous-datatype 1d, 2d and 3d convolutions, we default to using
# im2col + GEMM. Do so in a loop, here:
for (front_name, backend) in (
# This maps from public, front-facing name, to internal backend name
@@ -86,7 +86,7 @@ end

# We always support a fallback, non-accelerated path, where we use the direct, but
# slow, implementations. These should not typically be used, hence the `@debug`,
# but let's ggo ahead and define them first:
# but let's go ahead and define them first:
for front_name in (:conv, :∇conv_data, :∇conv_filter,
:depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter)
@eval begin
@@ -179,8 +179,6 @@ function conv(x, w::AbstractArray{T, N}; stride=1, pad=0, dilation=1, flipped=fa
end
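A hedged usage sketch of the public `conv` front-end that closes just above; the 4-d W×H×C×N layout and keyword names follow the signature shown, and the output shape is hand-computed rather than taken from the diff:

```julia
using NNlib
x = rand(Float32, 28, 28, 3, 1)    # width × height × channels × batch
w = rand(Float32, 5, 5, 3, 8)      # kernel width × height × C_in × C_out
y = conv(x, w; stride=1, pad=2, dilation=1)
size(y)                            # (28, 28, 8, 1): pad=2 keeps the spatial size with a 5×5 kernel
```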




"""
depthwiseconv(x, w; stride=1, pad=0, dilation=1, flipped=false)

18 changes: 9 additions & 9 deletions src/gemm.jl
@@ -9,9 +9,9 @@ using LinearAlgebra.BLAS: libblas, BlasInt, @blasfunc

Low-level gemm!() call with pointers, borrowed from Knet.jl

Calculates `C = alpha*op(A)*op(B) + beta*C`, where:
Calculates `C = α*op(A)*op(B) + β*C`, where:
- `transA` and `transB` set `op(X)` to be either `identity()` or `transpose()`
- alpha and beta are scalars
- α and β are scalars
- op(A) is an (M, K) matrix
- op(B) is a (K, N) matrix
- C is an (M, N) matrix.
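A minimal sketch of driving this low-level wrapper, assuming it is reachable as `NNlib.gemm!` (it is internal, not exported) and that the leading dimensions are derived internally from the matrix sizes:

```julia
using NNlib
A = rand(Float32, 3, 4); B = rand(Float32, 4, 5); C = zeros(Float32, 3, 5)
GC.@preserve A B C begin
    NNlib.gemm!(Val(false), Val(false),
                3, 5, 4,                        # M, N, K: op(A) is M×K, op(B) is K×N
                1.0f0, pointer(A), pointer(B),  # α, op(A), op(B)
                0.0f0, pointer(C))              # β, C
end
C ≈ A * B   # with α = 1 and β = 0 this reduces to a plain matrix product
```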
@@ -29,8 +29,8 @@ for (gemm, elt) in gemm_datatype_mappings
@eval begin
@inline function gemm!(transA::Val, transB::Val,
M::Int, N::Int, K::Int,
alpha::$(elt), A::Ptr{$elt}, B::Ptr{$elt},
beta::$(elt), C::Ptr{$elt})
α::$(elt), A::Ptr{$elt}, B::Ptr{$elt},
β::$(elt), C::Ptr{$elt})
# Convert our compile-time transpose marker to a char for BLAS
convtrans(V::Val{false}) = 'N'
convtrans(V::Val{true}) = 'T'
@@ -52,7 +52,7 @@ for (gemm, elt) in gemm_datatype_mappings
Ptr{$elt}, Ref{BlasInt}, Ref{$elt}, Ptr{$elt},
Ref{BlasInt}),
convtrans(transA), convtrans(transB), M, N, K,
alpha, A, lda, B, ldb, beta, C, ldc)
α, A, lda, B, ldb, β, C, ldc)
end
end
end
@@ -61,10 +61,10 @@ for (gemm, elt) in gemm_datatype_mappings
@eval begin
@inline function batched_gemm!(transA::AbstractChar,
transB::AbstractChar,
alpha::($elt),
α::($elt),
A::AbstractArray{$elt, 3},
B::AbstractArray{$elt, 3},
beta::($elt),
β::($elt),
C::AbstractArray{$elt, 3})
@assert !Base.has_offset_axes(A, B, C)
@assert size(A, 3) == size(B, 3) == size(C, 3) "batch size mismatch"
@@ -90,8 +90,8 @@ for (gemm, elt) in gemm_datatype_mappings
Ptr{$elt}, Ref{BlasInt}, Ref{$elt}, Ptr{$elt},
Ref{BlasInt}),
transA, transB, m, n,
ka, alpha, ptrA, max(1,Base.stride(A,2)),
ptrB, max(1,Base.stride(B,2)), beta, ptrC,
ka, α, ptrA, max(1,Base.stride(A,2)),
ptrB, max(1,Base.stride(B,2)), β, ptrC,
max(1,Base.stride(C,2)))

ptrA += size(A, 1) * size(A, 2) * sizeof($elt)
32 changes: 16 additions & 16 deletions src/impl/conv_direct.jl
@@ -18,7 +18,7 @@ function clamp_hi(x, w, L)
end

"""
conv_direct!(y, x, w, cdims; alpha=1, beta=0)
conv_direct!(y, x, w, cdims; α=1, β=0)

Direct convolution implementation; used for debugging, tests, and mixing/matching of
strange datatypes within a single convolution. Uses a naive nested for loop implementation
@@ -29,14 +29,14 @@ so that if the user really wants to convolve an image of `UInt8`'s with a `Float
kernel, storing the result in a `Float32` output, there is at least a function call
for that madness.

The keyword arguments `alpha` and `beta` control accumulation behavior; this function
calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a nonzero
value, the user is able to accumulate values into a preallocated `y` buffer, or by
setting `alpha` to a nonunitary value, an arbitrary gain factor can be applied.
The keyword arguments `α` and `β` control accumulation behavior; this function
calculates `y = α * x * w + β * y`, therefore by setting `β` to a non-zero
value, the user is able to accumulate values into a pre-allocated `y` buffer, or by
setting `α` to a non-unitary value, an arbitrary gain factor can be applied.

By defaulting `beta` to `false`, we make use of the Bradbury promotion trick to override
By defaulting `β` to `false`, we make use of the Bradbury promotion trick to override
`NaN`'s that may pre-exist within our output buffer, as `false*NaN == 0.0`, whereas
`0.0*NaN == NaN`. Only set `beta` if you are certain that none of the elements within
`0.0*NaN == NaN`. Only set `β` if you are certain that none of the elements within
`y` are `NaN`.
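The promotion trick the docstring leans on, shown in isolation (plain Julia, nothing specific to this package):

```julia
false * NaN   # 0.0: `false` acts as a "strong zero", so stale NaNs in y are overwritten
0.0   * NaN   # NaN: a literal zero β would let pre-existing NaNs leak into the result
```

Accumulating into a pre-allocated `y` then simply means passing a nonzero `β` under the new keyword names introduced in this PR.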

The basic implementation performs 3-dimensional convolution; 1-dimensional and 2-
Expand All @@ -47,7 +47,7 @@ conv_direct!

function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
w::AbstractArray{wT,5}, cdims::DenseConvDims;
alpha::yT = yT(1), beta = false) where {yT, xT, wT}
α::yT = yT(1), β = false) where {yT, xT, wT}
check_dims(size(x), size(w), size(y), cdims)

width, height, depth = input_size(cdims)
@@ -95,7 +95,7 @@ function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
c_in, c_out]
dotprod = muladd(x_val, w_val, dotprod)
end
y[w_idx, h_idx, d_idx, c_out, batch] = alpha*dotprod + beta*y[w_idx, h_idx, d_idx, c_out, batch]
y[w_idx, h_idx, d_idx, c_out, batch] = α*dotprod + β*y[w_idx, h_idx, d_idx, c_out, batch]
end

# Next, do potentially-padded regions:
@@ -138,47 +138,47 @@ function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
end
end

y[w_idx, h_idx, d_idx, c_out, batch] = alpha*dotprod + beta*y[w_idx, h_idx, d_idx, c_out, batch]
y[w_idx, h_idx, d_idx, c_out, batch] = α*dotprod + β*y[w_idx, h_idx, d_idx, c_out, batch]
end

return y
end

## Gradient definitions
"""
∇conv_data_direct!(dx, dy, w, cdims; alpha=1, beta=0)
∇conv_data_direct!(dx, dy, w, cdims; α=1, β=0)

Calculate the gradient imposed upon `x` in the convolution `y = x * w`.
"""
∇conv_data_direct!

function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5},
w::AbstractArray{wT,5}, cdims::DenseConvDims;
alpha::xT=xT(1), beta=false) where {xT, yT, wT}
α::xT=xT(1), β=false) where {xT, yT, wT}
w = transpose_swapbatch(w[end:-1:1, end:-1:1, end:-1:1, :, :])
dy = predilate(dy, stride(cdims))
ctdims = DenseConvDims(dy, w; padding=transpose_pad(cdims),
dilation=dilation(cdims),
flipkernel=flipkernel(cdims))
dx = conv_direct!(dx, dy, w, ctdims; alpha=alpha, beta=beta)
dx = conv_direct!(dx, dy, w, ctdims; α=α, β=β)
return dx
end

"""
∇conv_filter_direct!(dw, x, dy, cdims; alpha=1, beta=0)
∇conv_filter_direct!(dw, x, dy, cdims; α=1, β=0)

Calculate the gradient imposed upon `w` in the convolution `y = x * w`.
"""
∇conv_filter_direct!

function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5},
dy::AbstractArray{yT,5}, cdims::DenseConvDims;
alpha::wT=wT(1), beta=false) where {xT, yT, wT}
α::wT=wT(1), β=false) where {xT, yT, wT}
x = transpose_swapbatch(x[end:-1:1, end:-1:1, end:-1:1, :, :])
dy = transpose_swapbatch(predilate(dy, stride(cdims)))
ctdims = DenseConvDims(dy, x; padding=transpose_pad(cdims),
stride=dilation(cdims))
conv_direct!(dw, dy, x, ctdims; alpha=alpha, beta=beta)
conv_direct!(dw, dy, x, ctdims; α=α, β=β)
if flipkernel(cdims)
dw .= dw[end:-1:1, end:-1:1, end:-1:1, :, :]
end