Commit 32860fd

Remove trailing whitespace (#1108)
No functional changes
1 parent e890943 commit 32860fd

20 files changed, +56 -56 lines changed


docs/src/examples/maxlikenlm.jl
Lines changed: 2 additions & 2 deletions

@@ -196,13 +196,13 @@ parameters = Optim.minimizer(opt)
 numerical_hessian = hessian!(func,parameters)

 # Let's find the estimated value of σ, rather than log σ, and it's standard error
-# To do this, we will use the Delta Method: https://en.wikipedia.org/wiki/Delta_method
+# To do this, we will use the Delta Method: https://en.wikipedia.org/wiki/Delta_method

 # this function exponetiates log σ
 function transform(parameters)
     parameters[end] = exp(parameters[end])
     parameters
-end
+end

 # get the Jacobian of the transformation
 J = ForwardDiff.jacobian(transform, parameters)'
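
For context, the Delta Method step this example builds toward can be sketched as follows. This is a minimal illustration, assuming `numerical_hessian` is the Hessian of the negative log-likelihood at the optimum and `J` is the Jacobian computed above; the names `var_cov_matrix`, `transformed_cov`, and `standard_errors` are introduced here for illustration only.

using LinearAlgebra

# Asymptotic covariance of the untransformed estimates (β..., log σ)
var_cov_matrix = inv(numerical_hessian)
# Delta Method: covariance of the transformed estimates is J Σ Jᵀ
transformed_cov = J * var_cov_matrix * J'
# Approximate standard errors of (β..., σ)
standard_errors = sqrt.(diag(transformed_cov))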

docs/src/examples/rasch.jl
Lines changed: 25 additions & 25 deletions

@@ -11,13 +11,13 @@ using Optim, Random #hide
 # assessment data such as student responses to a standardized
 # test. Let $X_{pi}$ be the response accuracy of student $p$
 # to item $i$ where $X_{pi}=1$ if the item was answered correctly
-# and $X_{pi}=0$ otherwise for $p=1,\ldots,n$ and $i=1,\ldots,m$.
-# The model for this accuracy is
+# and $X_{pi}=0$ otherwise for $p=1,\ldots,n$ and $i=1,\ldots,m$.
+# The model for this accuracy is
 # ```math
 # P(\mathbf{X}_{p}=\mathbf{x}_{p}|\xi_p, \mathbf\epsilon) = \prod_{i=1}^m \dfrac{(\xi_p \epsilon_j)^{x_{pi}}}{1 + \xi_p\epsilon_i}
 # ```
-# where $\xi_p > 0$ the latent ability of person $p$ and $\epsilon_i > 0$
-# is the difficulty of item $i$.
+# where $\xi_p > 0$ the latent ability of person $p$ and $\epsilon_i > 0$
+# is the difficulty of item $i$.

 # We simulate data from this model:

@@ -28,9 +28,9 @@ theta = randn(n)
 delta = randn(m)
 r = zeros(n)
 s = zeros(m)
-
+
 for i in 1:n
-    p = exp.(theta[i] .- delta) ./ (1.0 .+ exp.(theta[i] .- delta))
+    p = exp.(theta[i] .- delta) ./ (1.0 .+ exp.(theta[i] .- delta))
     for j in 1:m
         if rand() < p[j] ##correct
             r[i] += 1
@@ -40,9 +40,9 @@ for i in 1:n
 end
 f = [sum(r.==j) for j in 1:m];

-# Since the number of parameters increases
-# with sample size standard maximum likelihood will not provide us
-# consistent estimates. Instead we consider the conditional likelihood.
+# Since the number of parameters increases
+# with sample size standard maximum likelihood will not provide us
+# consistent estimates. Instead we consider the conditional likelihood.
 # It can be shown that the Rasch model is an exponential family model and
 # that the sum score $r_p = \sum_{i} x_{pi}$ is the sufficient statistic for
 # $\xi_p$. If we condition on the sum score we should be able to eliminate
@@ -55,7 +55,7 @@ f = [sum(r.==j) for j in 1:m];
 # \gamma_r(\mathbf\epsilon) = \sum_{\mathbf{y} : \mathbf{1}^\intercal \mathbf{y} = r} \prod_{j=1}^m \epsilon_j^{y_j}
 # ```
 # where the sum is over all possible answer configurations that give a sum
-# score of $r$. Algorithms to efficiently compute $\gamma$ and its
+# score of $r$. Algorithms to efficiently compute $\gamma$ and its
 # derivatives are available in the literature (see eg Baker (1996) for a review
 # and Biscarri (2018) for a more modern approach)

@@ -65,7 +65,7 @@ function esf_sum!(S::AbstractArray{T,1}, x::AbstractArray{T,1}) where T <: Real
     S[1] = one(T)
     @inbounds for col in 1:n
         for r in 1:col
-            row = col - r + 1
+            row = col - r + 1
             S[row+1] = S[row+1] + x[col] * S[row]
         end
     end
@@ -114,20 +114,20 @@ function neglogLC(β)
     return -s'log.(ϵ) + f'log.(S[2:end])
 end

-# Parameter estimation is usually performed with respect to the unconstrained parameter
-# $\beta_i = -\log{\epsilon_i}$. Taking the derivative with respect to $\beta_i$
-# (and applying the chain rule) one obtains
+# Parameter estimation is usually performed with respect to the unconstrained parameter
+# $\beta_i = -\log{\epsilon_i}$. Taking the derivative with respect to $\beta_i$
+# (and applying the chain rule) one obtains
 # ```math
-# \dfrac{\partial\log L_C(\mathbf\epsilon|\mathbf{r})}{\partial \beta_i} = -s_i + \epsilon_i\sum_{r=1}^m \dfrac{f_r \gamma_{r-1}^{(j)}}{\gamma_r}
+# \dfrac{\partial\log L_C(\mathbf\epsilon|\mathbf{r})}{\partial \beta_i} = -s_i + \epsilon_i\sum_{r=1}^m \dfrac{f_r \gamma_{r-1}^{(j)}}{\gamma_r}
 # ```
-# where $\gamma_{r-1}^{(i)} = \partial \gamma_{r}(\mathbf\epsilon)/\partial\epsilon_i$.
+# where $\gamma_{r-1}^{(i)} = \partial \gamma_{r}(\mathbf\epsilon)/\partial\epsilon_i$.

 function g!(storage, β)
     calculate_common!(β, last_β)
     for j in 1:m
         storage[j] = s[j]
         for l in 1:m
-            storage[j] -= ϵ[j] * f[l] * (H[j,j,l+1] / S[l+1])
+            storage[j] -= ϵ[j] * f[l] * (H[j,j,l+1] / S[l+1])
         end
     end
 end
@@ -147,25 +147,25 @@ function h!(storage, β)
             storage[k,j] = 0.0
             for l in 1:m
                 if j == k
-                    storage[j,j] += f[l] * (ϵ[j]*H[j,j,l+1] / S[l+1]) *
+                    storage[j,j] += f[l] * (ϵ[j]*H[j,j,l+1] / S[l+1]) *
                         (1 - ϵ[j]*H[j,j,l+1] / S[l+1])
                 elseif k > j
-                    storage[k,j] += ϵ[j] * ϵ[k] * f[l] *
+                    storage[k,j] += ϵ[j] * ϵ[k] * f[l] *
                         ((H[k,j,l] / S[l+1]) - (H[j,j,l+1] * H[k,k,l+1]) / S[l+1] ^ 2)
                 else #k < j
-                    storage[k,j] += ϵ[j] * ϵ[k] * f[l] *
+                    storage[k,j] += ϵ[j] * ϵ[k] * f[l] *
                         ((H[j,k,l] / S[l+1]) - (H[j,j,l+1] * H[k,k,l+1]) / S[l+1] ^ 2)
                 end
             end
         end
     end
 end

-# The estimates of the item parameters are then obtained via standard optimization
-# algorithms (either Newton-Raphson or L-BFGS). One last issue is that the model is
-# not identifiable (multiplying the $\xi_p$ by a constant and dividing the $\epsilon_i$
-# by the same constant results in the same likelihood). Therefore some kind of constraint
-# must be imposed when estimating the parameters. Typically either $\epsilon_1 = 0$ or
+# The estimates of the item parameters are then obtained via standard optimization
+# algorithms (either Newton-Raphson or L-BFGS). One last issue is that the model is
+# not identifiable (multiplying the $\xi_p$ by a constant and dividing the $\epsilon_i$
+# by the same constant results in the same likelihood). Therefore some kind of constraint
+# must be imposed when estimating the parameters. Typically either $\epsilon_1 = 0$ or
 # $\prod_{i=1}^m \epsilon_i = 1$ (which is equivalent to $\sum_{i=1}^m \beta_i = 0$).

 con_c!(c, x) = (c[1] = sum(x); c)
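
As a side note on the elementary symmetric functions appearing in this example, here is a minimal standalone sketch of what `esf_sum!` computes. It restates the helper with an explicit `n = length(x)` and an initial `fill!` (the hunk above only shows part of the function), so the name `esf_sum_sketch!` and the test values are illustrative, not part of the example file.

using Test

function esf_sum_sketch!(S, x)
    # Fills S with the elementary symmetric functions γ_0, ..., γ_m of x,
    # i.e. the γ_r terms that enter the conditional likelihood above.
    n = length(x)
    fill!(S, zero(eltype(S)))
    S[1] = one(eltype(S))
    @inbounds for col in 1:n, r in 1:col
        row = col - r + 1
        S[row+1] = S[row+1] + x[col] * S[row]
    end
    return S
end

S = zeros(4)
esf_sum_sketch!(S, [2.0, 3.0, 5.0])
@test S == [1.0, 10.0, 31.0, 30.0]   # γ_0 = 1, γ_1 = 2+3+5, γ_2 = 6+10+15, γ_3 = 2*3*5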

docs/src/user/algochoice.md
Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ There are two main settings you must choose in Optim: the algorithm and the line

 The first choice to be made is that of the order of the method. Zeroth-order methods do not have gradient information, and are very slow to converge, especially in high dimension. First-order methods do not have access to curvature information and can take a large number of iterations to converge for badly conditioned problems. Second-order methods can converge very quickly once in the vicinity of a minimizer. Of course, this enhanced performance comes at a cost: the objective function has to be differentiable, you have to supply gradients and Hessians, and, for second order methods, a linear system has to be solved at each step.

-If you can provide analytic gradients and Hessians, and the dimension of the problem is not too large, then second order methods are very efficient. The Newton method with trust region is the method of choice.
+If you can provide analytic gradients and Hessians, and the dimension of the problem is not too large, then second order methods are very efficient. The Newton method with trust region is the method of choice.

 When you do not have an explicit Hessian or when the dimension becomes large enough that the linear solve in the Newton method becomes the bottleneck, first order methods should be preferred. BFGS is a very efficient method, but also requires a linear system solve. LBFGS usually has a performance very close to that of BFGS, and avoids linear system solves (the parameter `m` can be tweaked: increasing it can improve the convergence, at the expense of memory and time spent in linear algebra operations). The conjugate gradient method usually converges less quickly than LBFGS, but requires less memory. Gradient descent should only be used for testing. Acceleration methods are experimental.
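
To make that choice concrete, a minimal sketch of selecting a second-order and a first-order method in Optim (the Rosenbrock objective here is only a stand-in; any smooth function works, and derivatives are supplied via ForwardDiff rather than analytically):

using Optim

rosenbrock(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
x0 = zeros(2)

# Second order: Newton with a trust region, derivatives via automatic differentiation.
res_newton = optimize(rosenbrock, x0, NewtonTrustRegion(); autodiff = :forward)

# First order: limited-memory BFGS, the usual choice for larger problems.
res_lbfgs = optimize(rosenbrock, x0, LBFGS(); autodiff = :forward)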

docs/src/user/config.md
Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ In addition to the solver, you can alter the behavior of the Optim package by us
 * `show_warnings`: Should warnings due to NaNs or Inf be shown? Defaults to `true`.
 * `trace_simplex`: Include the full simplex in the trace for `NelderMead`. Defaults to `false`.
 * `show_every`: Trace output is printed every `show_every`th iteration.
-* `callback`: A function to be called during tracing. A return value of `true` stops the `optimize` call. The callback function is called every `show_every`th iteration. If `store_trace` is false, the argument to the callback is of the type [`OptimizationState`](https://github.com/JuliaNLSolvers/Optim.jl/blob/a1035134ca1f3ebe855f1cde034e32683178225a/src/types.jl#L155), describing the state of the current iteration. If `store_trace` is true, the argument is a list of all the states from the first iteration to the current.
+* `callback`: A function to be called during tracing. A return value of `true` stops the `optimize` call. The callback function is called every `show_every`th iteration. If `store_trace` is false, the argument to the callback is of the type [`OptimizationState`](https://github.com/JuliaNLSolvers/Optim.jl/blob/a1035134ca1f3ebe855f1cde034e32683178225a/src/types.jl#L155), describing the state of the current iteration. If `store_trace` is true, the argument is a list of all the states from the first iteration to the current.
 * `time_limit`: A soft upper limit on the total run time. Defaults to `NaN` (unlimited).

 Box constrained optimization has additional keywords to alter the behavior of the outer solver:
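
A minimal sketch of the `callback` option documented in this hunk (the objective, the stopping threshold, and the name `early_stop` are illustrative choices, not part of the docs):

using Optim

f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2

# With store_trace = false the callback receives a single OptimizationState;
# returning true stops the optimize call early.
early_stop(state) = state.value < 1e-8

optimize(f, zeros(2), LBFGS(),
         Optim.Options(callback = early_stop, show_every = 1))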

joss/paper.bib
Lines changed: 2 additions & 2 deletions

@@ -92,7 +92,7 @@ @article{damle2018variational
   year = 2018,
   month = jan,
   adsurl = {http://adsabs.harvard.edu/abs/2018arXiv180108572D},
-  adsnote = {Provided by the SAO/NASA Astrophysics Data System},
+  adsnote = {Provided by the SAO/NASA Astrophysics Data System},
 }

 @article{dony2018parametric,
@@ -119,7 +119,7 @@ @article{rackauckas2017differentialequations
 }

 @article{regier2016celeste,
-  author = {{Regier}, J. and {Pamnany}, K. and {Giordano}, R. and {Thomas}, R. and
+  author = {{Regier}, J. and {Pamnany}, K. and {Giordano}, R. and {Thomas}, R. and
            {Schlegel}, D. and {McAuliffe}, J. and {Prabhat}},
   title = "{Learning an Astronomical Catalog of the Visible Universe through Scalable Bayesian Inference}",
   journal = {ArXiv e-prints},

src/multivariate/solvers/constrained/fminbox.jl
Lines changed: 7 additions & 7 deletions

@@ -31,11 +31,11 @@ function in_box(bb::BoxBarrier, x)
     all(x->x[1]>=x[2] && x[1]<=x[3], zip(x, bb.lower, bb.upper))
 end
 in_box(bw::BarrierWrapper, x) = in_box(bw.b, x)
-# evaluates the value and gradient components comming from the log barrier
+# evaluates the value and gradient components comming from the log barrier
 function _barrier_term_value(x::T, l, u) where T
     dxl = x - l
     dxu = u - x
-
+
     if dxl <= 0 || dxu <= 0
         return T(Inf)
     end
@@ -53,7 +53,7 @@ function _barrier_term_gradient(x::T, l, u) where T
     if isfinite(u)
         g += one(T)/dxu
     end
-    return g
+    return g
 end
 function value_gradient!(bb::BoxBarrier, g, x)
     g .= _barrier_term_gradient.(x, bb.lower, bb.upper)
@@ -134,7 +134,7 @@ end
 # position, the gradient of the input function should dominate the
 # gradient of the barrier.
 function initial_mu(gfunc::AbstractArray{T}, gbarrier::AbstractArray{T}, mu0factor::T = T(1)/1000, mu0::T = convert(T, NaN)) where T
-
+
     if isnan(mu0)
         gbarriernorm = sum(abs, gbarrier)
         if gbarriernorm > 0
@@ -378,7 +378,7 @@ function optimize(
         # we need to update the +mu*barrier_grad part. Since we're using the
         # value_gradient! not !! as in initial_state, we won't make a superfluous
         # evaluation
-
+
         if !(F.method isa NelderMead)
             value_gradient!(dfbox, x)
         else
@@ -412,7 +412,7 @@ function optimize(
             println()
            println("Decreasing barrier term μ.\n")
         end
-
+
         # Decrease mu
         dfbox.mu *= T(F.mufactor)
         # Test for convergence
@@ -429,7 +429,7 @@ function optimize(
             stopped_by_time_limit = _time-t0 > options.time_limit ? true : false
             stopped = stopped_by_time_limit
         end
-
+
         stopped_by =(#f_limit_reached=f_limit_reached,
                      #g_limit_reached=g_limit_reached,
                      #h_limit_reached=h_limit_reached,
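
For orientation, the barrier machinery touched in this file backs the `Fminbox` wrapper. A minimal usage sketch (the objective, bounds, and starting point are illustrative):

using Optim

f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
lower = [-1.0, -1.0]
upper = [2.0, 2.0]
x0 = [0.5, 0.5]          # start strictly inside the box

# Fminbox runs an inner solver (here LBFGS) on f plus the log-barrier term,
# repeatedly decreasing the barrier coefficient μ as in the hunks above.
res = optimize(f, lower, upper, x0, Fminbox(LBFGS()))
Optim.minimizer(res)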

src/multivariate/solvers/constrained/ipnewton/interior.jl
Lines changed: 1 addition & 1 deletion

@@ -203,7 +203,7 @@ end
 function optimize(d, lower::AbstractArray, upper::AbstractArray, initial_x::AbstractArray, method::ConstrainedOptimizer,
                   options::Options = Options(;default_options(method)...))
     twicediffed = d isa TwiceDifferentiable ? d : TwiceDifferentiable(d, initial_x)
-
+
     bounds = ConstraintBounds(lower, upper, [], [])
     constraints = TwiceDifferentiableConstraints(
         (c,x)->nothing, (J,x)->nothing, (H,x,λ)->nothing, bounds)
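
The method shown in this hunk is what dispatches when a `ConstrainedOptimizer` is given plain box bounds. A minimal sketch of a call that reaches it (objective, bounds, and starting point are illustrative):

using Optim

f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
lower = [-1.0, -1.0]
upper = [2.0, 2.0]
x0 = [0.5, 0.5]

# IPNewton is a ConstrainedOptimizer, so f is wrapped in a TwiceDifferentiable
# and the box becomes ConstraintBounds with no general constraints.
res = optimize(f, lower, upper, x0, IPNewton())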

src/multivariate/solvers/constrained/samin.jl
Lines changed: 1 addition & 1 deletion

@@ -258,7 +258,7 @@ function optimize(obj_fn, lb::AbstractArray, ub::AbstractArray, x::AbstractArray
             # last value close enough to last neps values?
             fstar[1] = f_old
             f_absΔ = abs.(fopt - f_old) # close enough to best so far?
-            if all((abs.(fopt .- fstar)) .< f_tol) # within to for last neps trials?
+            if all((abs.(fopt .- fstar)) .< f_tol) # within to for last neps trials?
                 f_converged = true
                 # check for bound narrow enough for parameter convergence
                 if any(bounds .> x_tol)

src/multivariate/solvers/first_order/adam.jl
Lines changed: 1 addition & 1 deletion

@@ -42,7 +42,7 @@ function initial_state(method::Adam, options, d, initial_x::AbstractArray{T}) wh

     value_gradient!!(d, initial_x)
     α, β₁, β₂ = method.α, method.β₁, method.β₂
-
+
     m = copy(gradient(d))
     u = zero(m)
     a = 1 - β₁

src/multivariate/solvers/first_order/adamax.jl
Lines changed: 1 addition & 1 deletion

@@ -43,7 +43,7 @@ function initial_state(method::AdaMax, options, d, initial_x::AbstractArray{T})

     value_gradient!!(d, initial_x)
     α, β₁, β₂ = method.α, method.β₁, method.β₂
-
+
     m = copy(gradient(d))
     u = zero(m)
     a = 1 - β₁

src/multivariate/solvers/first_order/cg.jl
Lines changed: 1 addition & 1 deletion

@@ -111,7 +111,7 @@ function reset!(cg, cgs::ConjugateGradientState, obj, x)
     if cg.P !== nothing
         project_tangent!(cg.manifold, cgs.pg, x)
     end
-    cgs.s .= -cgs.pg
+    cgs.s .= -cgs.pg
     cgs.f_x_previous = typeof(cgs.f_x_previous)(NaN)
 end
 function initial_state(method::ConjugateGradient, options, d, initial_x)

src/univariate/optimize/interface.jl
Lines changed: 2 additions & 2 deletions

@@ -35,7 +35,7 @@ function optimize(f,
                   lower::Union{Integer, Real},
                   upper::Union{Integer, Real};
                   kwargs...)
-
+
     T = promote_type(typeof(lower/1), typeof(upper/1))
     optimize(f,
              T(lower),
@@ -48,7 +48,7 @@ function optimize(f,
                   upper::Union{Integer, Real},
                   method::Union{Brent, GoldenSection};
                   kwargs...)
-
+
     T = promote_type(typeof(lower/1), typeof(upper/1))
     optimize(f,
              T(lower),
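
These two methods promote Integer or mixed bounds to a common float type before calling the concrete univariate solver. A minimal sketch of the calls they enable (the quadratic objective is illustrative):

using Optim

# Integer bounds are accepted and promoted via promote_type(typeof(lower/1), typeof(upper/1)).
res_brent  = optimize(x -> (x - 2)^2, 0, 5)                    # defaults to Brent()
res_golden = optimize(x -> (x - 2)^2, 0, 5, GoldenSection())   # explicit method

Optim.minimizer(res_brent)   # ≈ 2.0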

src/univariate/printing.jl
Lines changed: 1 addition & 1 deletion

@@ -43,7 +43,7 @@ function Base.show(io::IO, r::UnivariateOptimizationResults)
     status_string = ""
     if time_run(r) > time_limit(r)
         status_string *= "failure (exceeded time limit of $(time_limit(r)))"
-    else
+    else
         status_string = "success"
     end
src/univariate/solvers/golden_section.jl
Lines changed: 1 addition & 1 deletion

@@ -130,7 +130,7 @@ function optimize(f, x_lower::T, x_upper::T,
                   rel_tol,
                   abs_tol,
                   tr,
-                  f_calls,
+                  f_calls,
                   time_limit,
                   _time)
 end

test/general/objective_types.jl
Lines changed: 2 additions & 2 deletions

@@ -13,12 +13,12 @@
         @test Optim.gradient(odad2) == [0.0]
         # @test odad3.g == [0.0]
     end
-
+
     for a in (1.0, 5.0)
         xa = rand(1)
        odad1 = OnceDifferentiable(x->a*x[1], xa; autodiff = :finite)
        odad2 = OnceDifferentiable(x->a*x[1], xa; autodiff = :forward)
-        # odad3 = OnceDifferentiable(x->a*x[1], xa; autodiff = :reverse)
+        # odad3 = OnceDifferentiable(x->a*x[1], xa; autodiff = :reverse)
         Optim.gradient!(odad1, xa)
         Optim.gradient!(odad2, xa)
         @test Optim.gradient(odad1) ≈ [a]

test/multivariate/measurements.jl
Lines changed: 1 addition & 1 deletion

@@ -9,5 +9,5 @@ import Measurements
     #given an initial value, they should give the exact same answer
     @test all(Optim.minimizer(resmes) .|> Measurements.value .== Optim.minimizer(resfloat))
     @test Optim.minimum(resmes) .|> Measurements.value .== Optim.minimum(resfloat)
-
+
 end

test/multivariate/optimize/optimize.jl
Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@
     end

     @testset "#718" begin
-
+
         f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
         function g!(G, x)
             G[1] = -2.0 * (1.0 - x[1]) - 400.0 * (x[2] - x[1]^2) * x[1]

test/multivariate/solvers/first_order/adam_adamax.jl
Lines changed: 3 additions & 3 deletions

@@ -14,8 +14,8 @@
     # TODO: Check why skip problems fail
     skip = ("Large Polynomial", "Parabola", "Paraboloid Random Matrix",
             "Paraboloid Diagonal", "Penalty Function I", "Polynomial", "Powell",
-            "Extended Powell", "Trigonometric", "Himmelblau", "Rosenbrock", "Extended Rosenbrock",
-            "Quadratic Diagonal", "Beale", "Fletcher-Powell", "Exponential",
+            "Extended Powell", "Trigonometric", "Himmelblau", "Rosenbrock", "Extended Rosenbrock",
+            "Quadratic Diagonal", "Beale", "Fletcher-Powell", "Exponential",
             )
     run_optim_tests(Adam();
                     skip = skip,
@@ -38,7 +38,7 @@ end
     skip = ("Trigonometric", "Large Polynomial", "Parabola", "Paraboloid Random Matrix",
             "Paraboloid Diagonal", "Extended Rosenbrock", "Penalty Function I", "Beale",
             "Extended Powell", "Himmelblau", "Large Polynomial", "Polynomial", "Powell",
-            "Exponential",
+            "Exponential",
             )
     run_optim_tests(AdaMax();
                     skip = skip,

test/univariate/solvers/brent.jl
Lines changed: 1 addition & 1 deletion

@@ -27,7 +27,7 @@
     @test Optim.converged(result)
     @test Optim.minimum(result) == -1.0

-    ## time limit
+    ## time limit
     function slow_obj(x)
        sleep(0.05)
        return sin(x)

test/univariate/solvers/golden_section.jl
Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@
     @test Optim.minimum(result) == 2.0
     @test_throws ErrorException optimize(identity, 2.0, 1.0, GoldenSection())

-    ## time limit
+    ## time limit
     function slow_obj(x)
        sleep(0.05)
        return sin(x)
