Commit 591d128

Fixing extended trace failure for Adam and AdaMax and generalising the alpha parameter to accept a callable object (scheduler) (#1115)
* adding alpha to state and generalizing alpha to accept a function (scheduler)
* removing unused variables
* adding alpha to AdaMax state and generalizing alpha to accept a function (scheduler)
* updating the docstring for Adam to add description of scheduled alpha constructors
* updating the docstring for AdaMax to add description of scheduled alpha constructors
* adding tests for scheduled Adam and AdaMax, which covers testing extended_trace=true case
* adding default constant alpha case tests for extended_trace=true for Adam and AdaMax
1 parent 711dfec commit 591d128

3 files changed: +164 additions, -18 deletions
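
In short, the commit lets `alpha` be either a constant or any callable mapping the iteration count to a step size. A minimal sketch of the new usage, mirroring the tests added below (the decay schedule itself is illustrative, not prescribed by the commit):

```julia
using Optim

# Toy problem taken from the new tests.
f(x) = x[1]^4
g!(storage, x) = (storage[1] = 4 * x[1]^3; nothing)

# Any callable mapping the iteration count to a step size can be passed as alpha.
alpha_scheduler(iter) = 0.0001 * (1 + 0.99^iter)

results = Optim.optimize(f, g!, [1.0], Adam(alpha = alpha_scheduler),
                         Optim.Options(iterations = 100_000, allow_f_increases = true))
@show Optim.minimum(results)
```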

src/multivariate/solvers/first_order/adam.jl

Lines changed: 53 additions & 8 deletions
@@ -1,17 +1,37 @@
 """
 # Adam
-## Constructor
+## Constant `alpha` case (default) constructor:
+
 ```julia
 Adam(; alpha=0.0001, beta_mean=0.9, beta_var=0.999, epsilon=1e-8)
 ```
+
+## Scheduled `alpha` case constructor:
+
+Alternative to the above (default) usage where `alpha` is a fixed constant for
+all the iterations, the following constructor provides flexibility for `alpha`
+to be a callable object (a scheduler) that maps the current iteration count to
+a value of `alpha` that is to-be used for the current optimization iteraion's
+update step. This helps us in scheduling `alpha` over the iterations as
+desired, using the following usage,
+
+```julia
+# Let alpha_scheduler be iteration -> alpha value mapping callable object
+Adam(; alpha=alpha_scheduler, other_kwargs...)
+```
+
 ## Description
-Adam is a gradient based optimizer that choses its search direction by building up estimates of the first two moments of the gradient vector. This makes it suitable for problems with a stochastic objective and thus gradient. The method is introduced in [1] where the related AdaMax method is also introduced, see `?AdaMax` for more information on that method.
+Adam is a gradient based optimizer that choses its search direction by building
+up estimates of the first two moments of the gradient vector. This makes it
+suitable for problems with a stochastic objective and thus gradient. The method
+is introduced in [1] where the related AdaMax method is also introduced, see
+`?AdaMax` for more information on that method.
 
 ## References
 [1] https://arxiv.org/abs/1412.6980
 """
-struct Adam{T, Tm} <: FirstOrderOptimizer
-    α::T
+struct Adam{Tα, T, Tm} <: FirstOrderOptimizer
+    α::Tα
     β₁::T
     β₂::T
     ϵ::T
@@ -32,20 +52,29 @@ mutable struct AdamState{Tx, T, Tm, Tu, Ti} <: AbstractOptimizerState
     s::Tx
     m::Tm
     u::Tu
+    alpha::T
     iter::Ti
 end
 function reset!(method, state::AdamState, obj, x)
     value_gradient!!(obj, x)
 end
+
+function _get_init_params(method::Adam{T}) where T <: Real
+    method.α, method.β₁, method.β₂
+end
+
+function _get_init_params(method::Adam)
+    method.α(1), method.β₁, method.β₂
+end
+
 function initial_state(method::Adam, options, d, initial_x::AbstractArray{T}) where T
     initial_x = copy(initial_x)
 
     value_gradient!!(d, initial_x)
-    α, β₁, β₂ = method.α, method.β₁, method.β₂
+    α, β₁, β₂ = _get_init_params(method)
 
     m = copy(gradient(d))
     u = zero(m)
-    a = 1 - β₁
     iter = 0
 
     AdamState(initial_x, # Maintain current state in state.x
@@ -54,13 +83,29 @@ function initial_state(method::Adam, options, d, initial_x::AbstractArray{T}) wh
               similar(initial_x), # Maintain current search direction in state.s
              m,
              u,
+             α,
             iter)
 end
 
+function _update_iter_alpha_in_state!(
+    state::AdamState, method::Adam{T}) where T <: Real
+
+    state.iter = state.iter+1
+end
+
+function _update_iter_alpha_in_state!(
+    state::AdamState, method::Adam)
+
+    state.iter = state.iter+1
+    state.alpha = method.α(state.iter)
+end
+
 function update_state!(d, state::AdamState{T}, method::Adam) where T
-    state.iter = state.iter+1
+
+    _update_iter_alpha_in_state!(state, method)
     value_gradient!(d, state.x)
-    α, β₁, β₂, ϵ = method.α, method.β₁, method.β₂, method.ϵ
+
+    α, β₁, β₂, ϵ = state.alpha, method.β₁, method.β₂, method.ϵ
     a = 1 - β₁
     b = 1 - β₂
 
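The constant and scheduled cases are separated purely by dispatch on the new `Tα` type parameter: when `α` is stored as a `Real`, the `Adam{T} where T <: Real` methods return it directly; otherwise the generic methods treat `α` as a callable of the iteration count. A standalone sketch of that pattern, using a stand-in type rather than Optim's actual `Adam`:

```julia
# Standalone illustration of the dispatch idea; `Opt` is a stand-in, not Optim's type.
struct Opt{Tα}
    α::Tα
end

# Constant step size: Tα is a Real, so this more specific method wins.
step_size(o::Opt{T}, iter) where T <: Real = o.α

# Scheduled step size: any other Tα is assumed callable on the iteration count.
step_size(o::Opt, iter) = o.α(iter)

step_size(Opt(1e-4), 10)                      # 0.0001
step_size(Opt(i -> 1e-4 * (1 + 0.99^i)), 10)  # ≈ 0.00019
```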
src/multivariate/solvers/first_order/adamax.jl

Lines changed: 52 additions & 10 deletions
@@ -1,18 +1,37 @@
 """
 # AdaMax
-## Constructor
+## Constant `alpha` case (default) constructor:
+
 ```julia
 AdaMax(; alpha=0.002, beta_mean=0.9, beta_var=0.999, epsilon=1e-8)
 ```
-## Description
-AdaMax is a gradient based optimizer that choses its search direction by building up estimates of the first two moments of the gradient vector. This makes it suitable for problems with a stochastic objective and thus gradient. The method is introduced in [1] where the related Adam method is also introduced, see `?Adam` for more information on that method.
 
+## Scheduled `alpha` case constructor:
+
+Alternative to the above (default) usage where `alpha` is a fixed constant for
+all the iterations, the following constructor provides flexibility for `alpha`
+to be a callable object (a scheduler) that maps the current iteration count to
+a value of `alpha` that is to-be used for the current optimization iteraion's
+update step. This helps us in scheduling `alpha` over the iterations as
+desired, using the following usage,
 
+```julia
+# Let alpha_scheduler be iteration -> alpha value mapping callable object
+AdaMax(; alpha=alpha_scheduler, other_kwargs...)
+```
+
+## Description
+AdaMax is a gradient based optimizer that choses its search direction by
+building up estimates of the first two moments of the gradient vector. This
+makes it suitable for problems with a stochastic objective and thus gradient.
+The method is introduced in [1] where the related Adam method is also
+introduced, see `?Adam` for more information on that method.
+
+## References
 [1] https://arxiv.org/abs/1412.6980
 """
-
-struct AdaMax{T,Tm} <: FirstOrderOptimizer
-    α::T
+struct AdaMax{Tα, T, Tm} <: FirstOrderOptimizer
+    α::Tα
     β₁::T
     β₂::T
     ϵ::T
@@ -33,20 +52,29 @@ mutable struct AdaMaxState{Tx, T, Tm, Tu, Ti} <: AbstractOptimizerState
     s::Tx
     m::Tm
     u::Tu
+    alpha::T
     iter::Ti
 end
 function reset!(method, state::AdaMaxState, obj, x)
     value_gradient!!(obj, x)
 end
+
+function _get_init_params(method::AdaMax{T}) where T <: Real
+    method.α, method.β₁, method.β₂
+end
+
+function _get_init_params(method::AdaMax)
+    method.α(1), method.β₁, method.β₂
+end
+
 function initial_state(method::AdaMax, options, d, initial_x::AbstractArray{T}) where T
     initial_x = copy(initial_x)
 
     value_gradient!!(d, initial_x)
-    α, β₁, β₂ = method.α, method.β₁, method.β₂
+    α, β₁, β₂ = _get_init_params(method)
 
     m = copy(gradient(d))
     u = zero(m)
-    a = 1 - β₁
     iter = 0
 
     AdaMaxState(initial_x, # Maintain current state in state.x
@@ -55,13 +83,27 @@ function initial_state(method::AdaMax, options, d, initial_x::AbstractArray{T})
                 similar(initial_x), # Maintain current search direction in state.s
                m,
                u,
+               α,
               iter)
 end
 
+function _update_iter_alpha_in_state!(
+    state::AdaMaxState, method::AdaMax{T}) where T <: Real
+
+    state.iter = state.iter+1
+end
+
+function _update_iter_alpha_in_state!(
+    state::AdaMaxState, method::AdaMax)
+
+    state.iter = state.iter+1
+    state.alpha = method.α(state.iter)
+end
+
 function update_state!(d, state::AdaMaxState{T}, method::AdaMax) where T
-    state.iter = state.iter+1
+    _update_iter_alpha_in_state!(state, method)
     value_gradient!(d, state.x)
-    α, β₁, β₂, ϵ = method.α, method.β₁, method.β₂, method.ϵ
+    α, β₁, β₂, ϵ = state.alpha, method.β₁, method.β₂, method.ϵ
     a = 1 - β₁
     m, u = state.m, state.u
 
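Because `alpha` only has to be callable on the iteration count, a functor (callable struct) also qualifies as a scheduler. A hypothetical example, not part of the commit, assuming a simple exponential decay:

```julia
using Optim

# Hypothetical functor scheduler; any object callable on the iteration count works.
struct ExpDecay
    alpha0::Float64
    decay::Float64
end
(s::ExpDecay)(iter) = s.alpha0 * s.decay^iter

f(x) = x[1]^4
g!(storage, x) = (storage[1] = 4 * x[1]^3; nothing)

results = Optim.optimize(f, g!, [1.0], AdaMax(alpha = ExpDecay(0.002, 0.999)),
                         Optim.Options(iterations = 10_000))
```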
test/multivariate/solvers/first_order/adam_adamax.jl

Lines changed: 59 additions & 0 deletions
@@ -21,6 +21,7 @@
               skip = skip,
               show_name = debug_printing)
 end
+
 @testset "AdaMax" begin
     f(x) = x[1]^4
     function g!(storage, x)
@@ -45,3 +46,61 @@ end
               show_name=debug_printing,
               iteration_exceptions = (("Trigonometric", 1_000_000,),))
 end
+
+@testset "Adam-scheduler" begin
+    f(x) = x[1]^4
+    function g!(storage, x)
+        storage[1] = 4 * x[1]^3
+        return
+    end
+
+    initial_x = [1.0]
+    options = Optim.Options(show_trace = debug_printing, allow_f_increases=true, iterations=100_000)
+    alpha_scheduler(iter) = 0.0001*(1 + 0.99^iter)
+    results = Optim.optimize(f, g!, initial_x, Adam(alpha=alpha_scheduler), options)
+    @test norm(Optim.minimum(results)) < 1e-6
+    @test summary(results) == "Adam"
+
+    # verifying the alpha values over iterations and also testing extended_trace
+    # this way we test both alpha scheduler and the working of
+    # extended_trace=true option
+
+    options = Optim.Options(show_trace = debug_printing, allow_f_increases=true, iterations=1000, extended_trace=true, store_trace=true)
+    results = Optim.optimize(f, g!, initial_x, Adam(alpha=1e-5), options)
+
+    @test prod(map(iter -> results.trace[iter].metadata["Current step size"], 2:results.iterations+1) .== 1e-5)
+
+    options = Optim.Options(show_trace = debug_printing, allow_f_increases=true, iterations=1000, extended_trace=true, store_trace=true)
+    results = Optim.optimize(f, g!, initial_x, Adam(alpha=alpha_scheduler), options)
+
+    @test map(iter -> results.trace[iter].metadata["Current step size"], 2:results.iterations+1) == alpha_scheduler.(1:results.iterations)
+end
+
+@testset "AdaMax-scheduler" begin
+    f(x) = x[1]^4
+    function g!(storage, x)
+        storage[1] = 4 * x[1]^3
+        return
+    end
+
+    initial_x = [1.0]
+    options = Optim.Options(show_trace = debug_printing, allow_f_increases=true, iterations=100_000)
+    alpha_scheduler(iter) = 0.002*(1 + 0.99^iter)
+    results = Optim.optimize(f, g!, initial_x, AdaMax(alpha=alpha_scheduler), options)
+    @test norm(Optim.minimum(results)) < 1e-6
+    @test summary(results) == "AdaMax"
+
+    # verifying the alpha values over iterations and also testing extended_trace
+    # this way we test both alpha scheduler and the working of
+    # extended_trace=true option
+
+    options = Optim.Options(show_trace = debug_printing, allow_f_increases=true, iterations=1000, extended_trace=true, store_trace=true)
+    results = Optim.optimize(f, g!, initial_x, AdaMax(alpha=1e-4), options)
+
+    @test prod(map(iter -> results.trace[iter].metadata["Current step size"], 2:results.iterations+1) .== 1e-4)
+
+    options = Optim.Options(show_trace = debug_printing, allow_f_increases=true, iterations=1000, extended_trace=true, store_trace=true)
+    results = Optim.optimize(f, g!, initial_x, AdaMax(alpha=alpha_scheduler), options)
+
+    @test map(iter -> results.trace[iter].metadata["Current step size"], 2:results.iterations+1) == alpha_scheduler.(1:results.iterations)
+end
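
The extended trace is what originally failed: with `extended_trace=true` and `store_trace=true`, each trace entry's metadata records the step size actually used, which the new tests compare against the scheduler. A short sketch of inspecting it, following the tests above:

```julia
using Optim

f(x) = x[1]^4
g!(storage, x) = (storage[1] = 4 * x[1]^3; nothing)
alpha_scheduler(iter) = 0.0001 * (1 + 0.99^iter)

opts = Optim.Options(iterations = 1000, extended_trace = true, store_trace = true)
results = Optim.optimize(f, g!, [1.0], Adam(alpha = alpha_scheduler), opts)

# Entry 1 is the initial state; entries 2:end hold the per-iteration metadata.
used = [results.trace[i].metadata["Current step size"] for i in 2:results.iterations+1]
@assert used == alpha_scheduler.(1:results.iterations)
```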
