Commit 6b9f4ec
Merge #1427, authored by bors[bot] with sambitdash and gxyd
2 parents a7e055b + 014af2f

1427: (Complete) Implementation of label smoothing with crossentropy
r=CarloLucibello a=gxyd

Trying to complete the PR #1025 from @sambitdash (thanks, Sambit). Closes #1016.

A few changes compared to the original code:
- Throwing an error when `label_smoothing` isn't between 0 and 1.
- Label smoothing is applied as a dispatch.

Co-authored-by: Sambit Kumar Dash <sambitdash@gmail.com>
Co-authored-by: Gaurav Dhingra <gauravdhingra.gxyd@gmail.com>
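
For orientation, a minimal sketch of the behavior described above: smoothing is a standalone call applied to the labels rather than a keyword of the losses, and out-of-range smoothing factors are rejected. The values follow from the formulas in the diff below; variable choices are illustrative, not from the commit.

    using Flux.Losses: label_smoothing

    label_smoothing([0.0, 1.0, 0.0], 0.3)        # 3 classes: entries 0.1, 0.8, 0.1
    label_smoothing(1.0, 0.2; dims=0)            # 0.9: a plain number is a binary label
    label_smoothing([0.0, 1.0, 0.0], 1.2)        # throws ArgumentError (α outside (0, 1))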

File tree: 4 files changed (+111 -10 lines)

docs/src/models/losses.md — 1 addition, 0 deletions

@@ -28,6 +28,7 @@ Flux.Losses.mae
 Flux.Losses.mse
 Flux.Losses.msle
 Flux.Losses.huber_loss
+Flux.Losses.label_smoothing
 Flux.Losses.crossentropy
 Flux.Losses.logitcrossentropy
 Flux.Losses.binarycrossentropy

src/losses/Losses.jl — 1 addition, 0 deletions

@@ -9,6 +9,7 @@ using NNlib: logsoftmax, logσ
 import Base.Broadcast: broadcasted
 
 export mse, mae, msle,
+       label_smoothing,
        crossentropy, logitcrossentropy,
        # binarycrossentropy, logitbinarycrossentropy # export only after end deprecation
        kldivergence,

src/losses/functions.jl — 70 additions, 9 deletions

@@ -46,6 +46,53 @@ function huber_loss(ŷ, y; agg=mean, δ=ofeltype(ŷ, 1))
     agg(((abs_error.^2) .* temp) .* x .+ δ*(abs_error .- x*δ) .* (1 .- temp))
 end
 
+"""
+    label_smoothing(y::Union{Number, AbstractArray}, α; dims::Int=1)
+
+Returns smoothed labels, meaning the confidence on label values is relaxed.
+
+When `y` is given as a one-hot vector or a batch of one-hot vectors, it is calculated as
+
+    y .* (1 - α) .+ α / size(y, dims)
+
+When `y` is given as a number or a batch of numbers for binary classification,
+it is calculated as
+
+    y .* (1 - α) .+ α / 2
+
+in which case the labels are squeezed towards `0.5`.
+
+α is a number in the interval (0, 1) called the smoothing factor. The higher
+the value of α, the larger the smoothing of `y`.
+
+`dims` denotes the one-hot dimension, unless `dims=0`, which denotes the application
+of label smoothing to binary distributions encoded in a single number.
+
+Usage example:
+
+    sf = 0.1
+    y = onehotbatch([1, 1, 1, 0, 0], 0:1)
+    y_smoothed = label_smoothing(y, 2sf)
+    y_sim = y .* (1-2sf) .+ sf
+    y_dis = copy(y_sim)
+    y_dis[1,:], y_dis[2,:] = y_dis[2,:], y_dis[1,:]
+    @assert crossentropy(y_sim, y) < crossentropy(y_sim, y_smoothed)
+    @assert crossentropy(y_dis, y) > crossentropy(y_dis, y_smoothed)
+"""
+function label_smoothing(y::Union{AbstractArray,Number}, α::Number; dims::Int=1)
+    if !(0 < α < 1)
+        throw(ArgumentError("α must be between 0 and 1"))
+    end
+    if dims == 0
+        y_smoothed = y .* (1 - α) .+ α * 1//2
+    elseif dims == 1
+        y_smoothed = y .* (1 - α) .+ α * 1 // size(y, 1)
+    else
+        throw(ArgumentError("`dims` should be either 0 or 1"))
+    end
+    return y_smoothed
+end
+
 
 """
     crossentropy(ŷ, y; dims=1, ϵ=eps(ŷ), agg=mean)
 
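
To make the one-hot formula above concrete, a minimal sketch against the API this hunk adds (the printed values follow from α = 0.2 and two classes; variable names are illustrative):

    using Flux
    using Flux.Losses: label_smoothing

    # One-hot targets for classes 0:1, stored as 2×4 columns.
    y = Flux.onehotbatch([1, 1, 0, 0], 0:1)

    # Each entry becomes 1*(1 - 0.2) + 0.2/2 = 0.9 or 0*(1 - 0.2) + 0.2/2 = 0.1.
    label_smoothing(y, 0.2)            # 2×4 matrix of 0.9s and 0.1s

    # dims=0 treats each number as a binary label, squeezing it towards 0.5.
    label_smoothing(1.0, 0.2; dims=0)  # 0.9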
@@ -54,16 +101,20 @@ calculated as
 
     agg(-sum(y .* log.(ŷ .+ ϵ); dims=dims))
 
-Cross entropy is tipically used as a loss in multi-class classification,
+Cross entropy is typically used as a loss in multi-class classification,
 in which case the labels `y` are given in a one-hot format.
 `dims` specifies the dimension (or the dimensions) containing the class probabilities.
 The prediction `ŷ` is supposed to sum to one across `dims`,
 as would be the case with the output of a [`softmax`](@ref) operation.
 
+Use [`label_smoothing`](@ref) to smooth the true labels as preprocessing before
+computing the loss.
+
 Use of [`logitcrossentropy`](@ref) is recommended over `crossentropy` for
 numerical stability.
 
-See also: [`Flux.logitcrossentropy`](@ref), [`Flux.binarycrossentropy`](@ref), [`Flux.logitbinarycrossentropy`](@ref)
+See also: [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref), [`logitbinarycrossentropy`](@ref),
+[`label_smoothing`](@ref)
 """
 function crossentropy(ŷ, y; dims=1, agg=mean, ϵ=epseltype(ŷ))
     agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims=dims))
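
A short sketch of the preprocessing pattern the updated docstring recommends (array shapes and names are illustrative, not from the commit):

    using Flux
    using Flux.Losses: crossentropy, label_smoothing

    ŷ = softmax(randn(Float32, 2, 4))         # predictions sum to one along dims=1
    y = Flux.onehotbatch([1, 1, 0, 0], 0:1)   # hard one-hot targets

    # Smooth the true labels first, then compute the loss as usual.
    crossentropy(ŷ, label_smoothing(y, 0.1))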
@@ -72,15 +123,19 @@ end
 """
     logitcrossentropy(ŷ, y; dims=1, agg=mean)
 
-Return the crossentropy computed after a [`Flux.logsoftmax`](@ref) operation;
+Return the crossentropy computed after a [`logsoftmax`](@ref) operation;
 calculated as
 
     agg(.-sum(y .* logsoftmax(ŷ; dims=dims); dims=dims))
 
+Use [`label_smoothing`](@ref) to smooth the true labels as preprocessing before
+computing the loss.
+
 `logitcrossentropy(ŷ, y)` is mathematically equivalent to
-[`Flux.Losses.crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable.
+[`crossentropy(softmax(ŷ), y)`](@ref) but it is more numerically stable.
+
 
-See also: [`Flux.Losses.crossentropy`](@ref), [`Flux.Losses.binarycrossentropy`](@ref), [`Flux.Losses.logitbinarycrossentropy`](@ref)
+See also: [`crossentropy`](@ref), [`binarycrossentropy`](@ref), [`logitbinarycrossentropy`](@ref), [`label_smoothing`](@ref)
 """
 function logitcrossentropy(ŷ, y; dims=1, agg=mean)
     agg(.-sum(y .* logsoftmax(ŷ; dims=dims); dims=dims))
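
A sketch of the equivalence the docstring states, with smoothed labels in place of hard ones (both sides agree up to floating point; names are illustrative):

    using Flux
    using Flux.Losses: logitcrossentropy, crossentropy, label_smoothing

    logits = randn(Float32, 2, 4)   # raw scores, no softmax applied
    ys = label_smoothing(Flux.onehotbatch([1, 1, 0, 0], 0:1), 0.1)

    # Same value, but the logit form avoids an explicit softmax and is more stable.
    logitcrossentropy(logits, ys) ≈ crossentropy(softmax(logits), ys)   # true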
@@ -97,9 +152,13 @@ The `ϵ` term provides numerical stability.
 
 Typically, the prediction `ŷ` is given by the output of a [`sigmoid`](@ref) activation.
 
+Use [`label_smoothing`](@ref) to smooth the `y` value as preprocessing before
+computing the loss.
+
 Use of `logitbinarycrossentropy` is recommended over `binarycrossentropy` for numerical stability.
 
-See also: [`Flux.Losses.crossentropy`](@ref), [`Flux.Losses.logitcrossentropy`](@ref), [`Flux.Losses.logitbinarycrossentropy`](@ref)
+See also: [`crossentropy`](@ref), [`logitcrossentropy`](@ref), [`logitbinarycrossentropy`](@ref),
+[`label_smoothing`](@ref)
 """
 function binarycrossentropy(ŷ, y; agg=mean, ϵ=epseltype(ŷ))
     agg(@.(-xlogy(y, ŷ+ϵ) - xlogy(1-y, 1-ŷ+ϵ)))
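
A sketch of the binary case, where `dims=0` smooths scalar labels towards 0.5 before the loss is taken (shapes and names are illustrative):

    using Flux
    using Flux.Losses: binarycrossentropy, label_smoothing

    ŷ = σ.(randn(Float32, 4))   # sigmoid outputs in (0, 1)
    y = Float32[1, 1, 0, 0]     # binary labels encoded as single numbers

    # With α = 0.2 the targets become 0.9 and 0.1.
    binarycrossentropy(ŷ, label_smoothing(y, 0.2; dims=0))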
@@ -111,10 +170,12 @@ end
     logitbinarycrossentropy(ŷ, y; agg=mean)
 
 Mathematically equivalent to
-[`Flux.binarycrossentropy(σ(ŷ), y)`](@ref) but is more numerically stable.
+[`binarycrossentropy(σ(ŷ), y)`](@ref) but is more numerically stable.
+
+Use [`label_smoothing`](@ref) to smooth the `y` value as preprocessing before
+computing the loss.
 
-See also: [`Flux.Losses.crossentropy`](@ref), [`Flux.Losses.logitcrossentropy`](@ref), [`Flux.Losses.binarycrossentropy`](@ref)
-```
+See also: [`crossentropy`](@ref), [`logitcrossentropy`](@ref), [`binarycrossentropy`](@ref), [`label_smoothing`](@ref)
 """
 function logitbinarycrossentropy(ŷ, y; agg=mean)
     agg(@.((1-y)*ŷ - logσ(ŷ)))
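
And the logit variant, mirroring the test added below: equal up to floating point when `ϵ=0` (names are illustrative):

    using Flux
    using Flux.Losses: logitbinarycrossentropy, binarycrossentropy, label_smoothing

    logits = randn(Float32, 4)
    ys = label_smoothing(rand(Float32, 4), 0.2; dims=0)

    # Stable form of binarycrossentropy(σ.(logits), ys):
    logitbinarycrossentropy(logits, ys) ≈ binarycrossentropy(σ.(logits), ys; ϵ=0)   # true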

test/losses.jl — 39 additions, 1 deletion

@@ -1,7 +1,7 @@
 using Test
 using Flux: onehotbatch, σ
 
-using Flux.Losses: mse, crossentropy, logitcrossentropy, binarycrossentropy, logitbinarycrossentropy
+using Flux.Losses: mse, label_smoothing, crossentropy, logitcrossentropy, binarycrossentropy, logitbinarycrossentropy
 using Flux.Losses: xlogx, xlogy
 
 # group here all losses, used in tests
@@ -56,31 +56,69 @@ end
 
 # Now onehot y's
 y = onehotbatch([1, 1, 0, 0], 0:1)
+y_smoothed = label_smoothing(y, 0.1)
 ŷ = [.1 .9; .9 .1; .9 .1; .1 .9]'
 v = log(.1 / .9)
 logŷ = [v 0.0; 0.0 v; 0.0 v; v 0.0]'
 lossvalue = 1.203972804325936
+lossvalue_smoothed = 1.2039728043259348
+yl = onehotbatch([1], 0:1)
+sf = 0.1
+yls = [sf (1-sf)]' # Effective y after label smoothing
+ylp = [0.9 0.1]'
+logylp = [0.0 v]'
+
+# Construct `sim`ilar and `dis`similar versions of the dataset so we can test the effect of smoothing:
+# smoothing should decrease the loss on dissimilar data and increase the loss on similar data,
+# compared to the loss without smoothing.
+ya = onehotbatch([1, 1, 1, 0, 0], 0:1)
+ya_smoothed = label_smoothing(ya, 2sf)
+y_same = Float32.(ya)
+y_sim = y_same .* (1-2*sf) .+ sf
+y_dis = copy(y_sim)
+y_dis[1,:], y_dis[2,:] = y_dis[2,:], y_dis[1,:]
 
 @testset "crossentropy" begin
   @test crossentropy([0.1,0.0,0.9], [0.1,0.0,0.9]) ≈ crossentropy([0.1,0.9], [0.1,0.9])
   @test crossentropy(ŷ, y) ≈ lossvalue
+  @test crossentropy(ŷ, y_smoothed) ≈ lossvalue_smoothed
+  @test crossentropy(ylp, label_smoothing(yl, 2sf)) ≈ -sum(yls.*log.(ylp))
+  @test crossentropy(ylp, yl) ≈ -sum(yl.*log.(ylp))
+  @test iszero(crossentropy(y_same, ya, ϵ=0))
+  @test iszero(crossentropy(ya, ya, ϵ=0))
+  @test crossentropy(y_sim, ya) < crossentropy(y_sim, ya_smoothed)
+  @test crossentropy(y_dis, ya) > crossentropy(y_dis, ya_smoothed)
 end
 
 @testset "logitcrossentropy" begin
   @test logitcrossentropy(logŷ, y) ≈ lossvalue
+  @test logitcrossentropy(logylp, yl) ≈ -sum(yl.*logsoftmax(logylp))
+  @test logitcrossentropy(logylp, label_smoothing(yl, 2sf)) ≈ -sum(yls.*logsoftmax(logylp))
 end
 
 logŷ, y = randn(3), rand(3)
+yls = y.*(1-2sf).+sf
 
 @testset "binarycrossentropy" begin
+  @test binarycrossentropy.(σ.(logŷ), label_smoothing(y, 2sf; dims=0); ϵ=0) ≈ -yls.*log.(σ.(logŷ)) - (1 .- yls).*log.(1 .- σ.(logŷ))
   @test binarycrossentropy(σ.(logŷ), y; ϵ=0) ≈ mean(-y.*log.(σ.(logŷ)) - (1 .- y).*log.(1 .- σ.(logŷ)))
   @test binarycrossentropy(σ.(logŷ), y) ≈ mean(-y.*log.(σ.(logŷ) .+ eps.(σ.(logŷ))) - (1 .- y).*log.(1 .- σ.(logŷ) .+ eps.(σ.(logŷ))))
 end
 
 @testset "logitbinarycrossentropy" begin
+  @test logitbinarycrossentropy.(logŷ, label_smoothing(y, 0.2)) ≈ binarycrossentropy.(σ.(logŷ), label_smoothing(y, 0.2); ϵ=0)
   @test logitbinarycrossentropy(logŷ, y) ≈ binarycrossentropy(σ.(logŷ), y; ϵ=0)
 end
 
+y = onehotbatch([1], 0:1)
+yls = [0.1 0.9]'
+@testset "label_smoothing" begin
+  @test label_smoothing(y, 0.2) == yls
+  @test label_smoothing(y, 0.2; dims=0) == label_smoothing.(y, 0.2; dims=0)
+  @test_throws ArgumentError label_smoothing([0., 0., 1., 0.], 1.2)
+  @test_throws ArgumentError label_smoothing([0., 0., 1., 0.], 0.)
+end
+
 y = [1 2 3]
 ŷ = [4.0 5.0 6.0]
