From 1b4244dd60f543f8a55d30ba1e8b4a1bc2863874 Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Thu, 12 Jan 2023 17:03:43 +0200 Subject: [PATCH 01/16] CompressedLBFGS structure + Matrix interface + mul! (first version) --- src/compressed_lbfgs.jl | 143 ++++++++++++++++++++++++++++++++++++++++ src/qn.jl | 2 + 2 files changed, 145 insertions(+) create mode 100644 src/compressed_lbfgs.jl diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl new file mode 100644 index 00000000..98908a00 --- /dev/null +++ b/src/compressed_lbfgs.jl @@ -0,0 +1,143 @@ +#= +Compressed LBFGS implementation from: + REPRESENTATIONS OF QUASI-NEWTON MATRICES AND THEIR USE IN LIMITED MEMORY METHODS + Richard H. Byrd, Jorge Nocedal and Robert B. Schnabel (1994) + DOI: 10.1007/BF01582063 + +Implemented by Paul Raynaud (supervised by Dominique Orban) +=# + +using LinearAlgebra + +export CompressedLBFGS + +mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} + m::Int # memory of the operator + n::Int # vector size + k::Int # k ≤ m, active memory of the operator + α::T # B₀ = αI + Sₖ::M # gather all sₖ₋ₘ + Yₖ::M # gather all yₖ₋ₘ + Dₖ::Diagonal{T,V} # m * m + Lₖ::LowerTriangular{T,M} # m * m + + chol_matrix::M # 2m * 2m + intermediate_1::UpperTriangular{T,M} # 2m * 2m + intermediate_2::LowerTriangular{T,M} # 2m * 2m + inverse_intermediate_1::UpperTriangular{T,M} # 2m * 2m + inverse_intermediate_2::LowerTriangular{T,M} # 2m * 2m + sol::V # m + inverse::Bool +end + +default_matrix_type(gpu::Bool, T::DataType) = gpu ? CuMatrix{T} : Matrix{T} +default_vector_type(gpu::Bool, T::DataType) = gpu ? 
CuVector{T} : Vector{T} + +function CompressedLBFGS(m::Int, n::Int; T=Float64, gpu=false, M=default_matrix_type(gpu,T), V=default_vector_type(gpu,T)) + α = (T)(1) + k = 0 + Sₖ = M(undef,n,m) + Yₖ = M(undef,n,m) + Dₖ = Diagonal(V(undef,m)) + Lₖ = LowerTriangular(M(undef,m,m)) + + chol_matrix = M(undef,m,m) + intermediate_1 = UpperTriangular(M(undef,2*m,2*m)) + intermediate_2 = LowerTriangular(M(undef,2*m,2*m)) + inverse_intermediate_1 = UpperTriangular(M(undef,2*m,2*m)) + inverse_intermediate_2 = LowerTriangular(M(undef,2*m,2*m)) + sol = V(undef,2*m) + inverse = false + return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, sol, inverse) +end + +function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} + if op.k < op.m # still some place in structures + op.k += 1 + op.Sₖ[:,op.k] .= s + op.Yₖ[:,op.k] .= y + op.Dₖ.diag[op.k] = dot(s,y) + op.Lₖ.data[op.k, op.k] = 0 + for i in 1:op.k-1 + op.Lₖ.data[op.k, i] = dot(s,op.Yₖ[:,i]) + end + # the secan equation fails if this line is uncommented + # op.α = dot(y,s)/dot(s,s) + else # update matrix with circular shift + # must be tested + circshift(op.Sₖ, (0,-1)) + circshift(op.Yₖ, (0,-1)) + circshift(op.Dₖ, (-1,-1)) + # circshift doesn't work for a LowerTriangular matrix + for j in 2:op.k + for i in 1:j-1 + op.Lₖ.data[j, i] = dot(op.Sₖ[:,j],op.Yₖ[:,i]) + end + end + end + op.inverse = false + return op +end + +# Theorem 2.3 (p6) +function Base.Matrix(op::CompressedLBFGS{T,M,V}) where {T,M,V} + B₀ = M(zeros(T,op.n, op.n)) + map(i -> B₀[i,i] = op.α, 1:op.n) + + BSY = M(undef, op.n, 2*op.k) + (op.k > 0) && (BSY[:,1:op.k] = B₀ * op.Sₖ[:,1:op.k]) + (op.k > 0) && (BSY[:,op.k+1:2*op.k] = op.Yₖ[:,1:op.k]) + _C = M(undef, 2*op.k, 2*op.k) + (op.k > 0) && (_C[1:op.k, 1:op.k] .= transpose(op.Sₖ[:,1:op.k]) * op.Sₖ[:,1:op.k]) + (op.k > 0) && (_C[1:op.k, op.k+1:2*op.k] .= op.Lₖ[1:op.k,1:op.k]) + (op.k > 0) && 
(_C[op.k+1:2*op.k, 1:op.k] .= transpose(op.Lₖ[1:op.k,1:op.k])) + (op.k > 0) && (_C[op.k+1:2*op.k, op.k+1:2*op.k] .= .- op.Dₖ[1:op.k,1:op.k]) + C = inv(_C) + + Bₖ = B₀ .- BSY * C * transpose(BSY) + return Bₖ +end + +function inverse_cholesky(op::CompressedLBFGS) + if !op.inverse + op.chol_matrix[1:op.k,1:op.k] .= op.α .* (transpose(op.Sₖ[:,1:op.k]) * op.Sₖ[:,1:op.k]) .+ op.Lₖ[1:op.k,1:op.k] * inv(op.Dₖ[1:op.k,1:op.k]) * transpose(op.Lₖ[1:op.k,1:op.k]) + cholesky!(view(op.chol_matrix,1:op.k,1:op.k)) + op.inverse = true + end + Jₖ = transpose(UpperTriangular(op.chol_matrix[1:op.k,1:op.k])) + return Jₖ +end + +# Algorithm 3.2 (p15) +function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} + # step 1-3 mainly done by Base.push! + # step 4, Jₖ is computed only if needed + Jₖ = inverse_cholesky(op::CompressedLBFGS) + + # step 5, try views for mul! + # mul!(op.sol[1:op.k], transpose(op.Yₖ[:,1:op.k]), v) # wrong result + # mul!(op.sol[op.k+1:2*op.k], transpose(op.Yₖ[:,1:op.k]), v, (T)(1), op.α) # wrong result + op.sol[1:op.k] .= transpose(op.Yₖ[:,1:op.k]) * v + op.sol[op.k+1:2*op.k] .= op.α .* transpose(op.Sₖ[:,1:op.k]) * v + + # step 6, must be improve + op.intermediate_1[1:op.k,1:op.k] .= .- op.Dₖ[1:op.k,1:op.k]^(1/2) + op.intermediate_1[1:op.k,op.k+1:2*op.k] .= op.Dₖ[1:op.k,1:op.k]^(-1/2) * transpose(op.Lₖ[1:op.k,1:op.k]) + op.intermediate_1[op.k+1:2*op.k,1:op.k] .= 0 + op.intermediate_1[op.k+1:2*op.k,op.k+1:2*op.k] .= transpose(Jₖ) + + op.intermediate_2[1:op.k,1:op.k] .= op.Dₖ[1:op.k,1:op.k]^(1/2) + op.intermediate_2[1:op.k,op.k+1:2*op.k] .= 0 + op.intermediate_2[op.k+1:2*op.k,1:op.k] .= .- op.Lₖ[1:op.k,1:op.k] * op.Dₖ[1:op.k,1:op.k]^(-1/2) + op.intermediate_2[op.k+1:2*op.k,op.k+1:2*op.k] .= Jₖ + + op.inverse_intermediate_1[1:2*op.k,1:2*op.k] .= inv(op.intermediate_1[1:2*op.k,1:2*op.k]) + op.inverse_intermediate_2[1:2*op.k,1:2*op.k] .= inv(op.intermediate_2[1:2*op.k,1:2*op.k]) + + op.sol[1:2*op.k] .= 
op.inverse_intermediate_1[1:2*op.k,1:2*op.k] * (op.inverse_intermediate_2[1:2*op.k,1:2*op.k] * op.sol[1:2*op.k]) + + # step 7 + Bv .= op.α .* v .- (op.Yₖ[:,1:op.k] * op.sol[1:op.k] .+ op.α .* op.Sₖ[:,1:op.k] * op.sol[op.k+1:2*op.k]) + + return Bv +end \ No newline at end of file diff --git a/src/qn.jl b/src/qn.jl index 0cec28f0..3b2a335e 100644 --- a/src/qn.jl +++ b/src/qn.jl @@ -5,3 +5,5 @@ import LinearAlgebra.diag include("lbfgs.jl") include("lsr1.jl") + +include("compressed_lbfgs.jl") \ No newline at end of file From 647b19e7fceecbf95cce448d9e90cb8234acb364 Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Mon, 16 Jan 2023 15:43:57 +0200 Subject: [PATCH 02/16] allocation free for the core of the mul! method --- Project.toml | 2 + src/compressed_lbfgs.jl | 87 ++++++++++++++++++++++++----------------- 2 files changed, 53 insertions(+), 36 deletions(-) diff --git a/Project.toml b/Project.toml index 097267c3..1fb13163 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,7 @@ uuid = "5c8ed15e-5a4c-59e4-a42b-c7e8811fb125" version = "2.5.1" [deps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a" LDLFactorizations = "40e66cde-538c-5869-a4ad-c39174c6795b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -11,6 +12,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [compat] +CUDA = "3.12.1" FastClosures = "0.2, 0.3" LDLFactorizations = "0.9, 0.10" TimerOutputs = "^0.5" diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index 98908a00..b8077008 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -7,7 +7,8 @@ Compressed LBFGS implementation from: Implemented by Paul Raynaud (supervised by Dominique Orban) =# -using LinearAlgebra +using LinearAlgebra, LinearAlgebra.BLAS +using CUDA export CompressedLBFGS @@ -26,14 +27,16 @@ mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} 
intermediate_2::LowerTriangular{T,M} # 2m * 2m inverse_intermediate_1::UpperTriangular{T,M} # 2m * 2m inverse_intermediate_2::LowerTriangular{T,M} # 2m * 2m + intermediary_vector::V # 2m sol::V # m - inverse::Bool + intermediate_structure_updated::Bool end +default_gpu() = CUDA.functional() ? true : false default_matrix_type(gpu::Bool, T::DataType) = gpu ? CuMatrix{T} : Matrix{T} default_vector_type(gpu::Bool, T::DataType) = gpu ? CuVector{T} : Vector{T} -function CompressedLBFGS(m::Int, n::Int; T=Float64, gpu=false, M=default_matrix_type(gpu,T), V=default_vector_type(gpu,T)) +function CompressedLBFGS(m::Int, n::Int; T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu,T), V=default_vector_type(gpu,T)) α = (T)(1) k = 0 Sₖ = M(undef,n,m) @@ -46,9 +49,10 @@ function CompressedLBFGS(m::Int, n::Int; T=Float64, gpu=false, M=default_matrix_ intermediate_2 = LowerTriangular(M(undef,2*m,2*m)) inverse_intermediate_1 = UpperTriangular(M(undef,2*m,2*m)) inverse_intermediate_2 = LowerTriangular(M(undef,2*m,2*m)) + intermediary_vector = V(undef,2*m) sol = V(undef,2*m) - inverse = false - return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, sol, inverse) + intermediate_structure_updated = false + return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol, intermediate_structure_updated) end function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} @@ -69,13 +73,14 @@ function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:Abstra circshift(op.Yₖ, (0,-1)) circshift(op.Dₖ, (-1,-1)) # circshift doesn't work for a LowerTriangular matrix + # for the time being, reinstantiate completely the Lₖ matrix for j in 2:op.k for i in 1:j-1 op.Lₖ.data[j, i] = dot(op.Sₖ[:,j],op.Yₖ[:,i]) end end end - op.inverse = false + 
op.intermediate_structure_updated = false return op end @@ -98,46 +103,56 @@ function Base.Matrix(op::CompressedLBFGS{T,M,V}) where {T,M,V} return Bₖ end +# step 4, Jₖ is computed only if needed function inverse_cholesky(op::CompressedLBFGS) - if !op.inverse - op.chol_matrix[1:op.k,1:op.k] .= op.α .* (transpose(op.Sₖ[:,1:op.k]) * op.Sₖ[:,1:op.k]) .+ op.Lₖ[1:op.k,1:op.k] * inv(op.Dₖ[1:op.k,1:op.k]) * transpose(op.Lₖ[1:op.k,1:op.k]) - cholesky!(view(op.chol_matrix,1:op.k,1:op.k)) - op.inverse = true - end - Jₖ = transpose(UpperTriangular(op.chol_matrix[1:op.k,1:op.k])) + view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(op.Dₖ[1:op.k, 1:op.k]) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) + cholesky!(view(op.chol_matrix,1:op.k,1:op.k)) + Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k))) return Jₖ end +# step 6, must be improve +function precompile_iterated_structure!(op::CompressedLBFGS) + Jₖ = inverse_cholesky(op) + + view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.Dₖ, 1:op.k, 1:op.k)^(1/2) + view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) + view(op.intermediate_1, op.k+1:2*op.k, 1:op.k) .= 0 + view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ) + + view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(1/2) + view(op.intermediate_2, 1:op.k, op.k+1:2*op.k) .= 0 + view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) + view(op.intermediate_2, op.k+1:2*op.k, op.k+1:2*op.k) .= Jₖ + + view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[ 1:2*op.k,1:2*op.k]) + view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_2[ 1:2*op.k,1:2*op.k]) + + op.intermediate_structure_updated = true +end + # Algorithm 3.2 (p15) function 
LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} # step 1-3 mainly done by Base.push! - # step 4, Jₖ is computed only if needed - Jₖ = inverse_cholesky(op::CompressedLBFGS) + + # steps 4 and 6, in case the intermediary required structure are not up to date + (!op.intermediate_structure_updated) && (precompile_iterated_structure!(op)) # step 5, try views for mul! - # mul!(op.sol[1:op.k], transpose(op.Yₖ[:,1:op.k]), v) # wrong result - # mul!(op.sol[op.k+1:2*op.k], transpose(op.Yₖ[:,1:op.k]), v, (T)(1), op.α) # wrong result - op.sol[1:op.k] .= transpose(op.Yₖ[:,1:op.k]) * v - op.sol[op.k+1:2*op.k] .= op.α .* transpose(op.Sₖ[:,1:op.k]) * v - - # step 6, must be improve - op.intermediate_1[1:op.k,1:op.k] .= .- op.Dₖ[1:op.k,1:op.k]^(1/2) - op.intermediate_1[1:op.k,op.k+1:2*op.k] .= op.Dₖ[1:op.k,1:op.k]^(-1/2) * transpose(op.Lₖ[1:op.k,1:op.k]) - op.intermediate_1[op.k+1:2*op.k,1:op.k] .= 0 - op.intermediate_1[op.k+1:2*op.k,op.k+1:2*op.k] .= transpose(Jₖ) - - op.intermediate_2[1:op.k,1:op.k] .= op.Dₖ[1:op.k,1:op.k]^(1/2) - op.intermediate_2[1:op.k,op.k+1:2*op.k] .= 0 - op.intermediate_2[op.k+1:2*op.k,1:op.k] .= .- op.Lₖ[1:op.k,1:op.k] * op.Dₖ[1:op.k,1:op.k]^(-1/2) - op.intermediate_2[op.k+1:2*op.k,op.k+1:2*op.k] .= Jₖ - - op.inverse_intermediate_1[1:2*op.k,1:2*op.k] .= inv(op.intermediate_1[1:2*op.k,1:2*op.k]) - op.inverse_intermediate_2[1:2*op.k,1:2*op.k] .= inv(op.intermediate_2[1:2*op.k,1:2*op.k]) - - op.sol[1:2*op.k] .= op.inverse_intermediate_1[1:2*op.k,1:2*op.k] * (op.inverse_intermediate_2[1:2*op.k,1:2*op.k] * op.sol[1:2*op.k]) + mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v) + mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :,1:op.k)), v) + # scal!(op.α, view(op.sol, op.k+1:2*op.k)) # more allocation, slower + view(op.sol, op.k+1:2*op.k) .*= op.α + + # view(op.sol, 1:2*op.k) .= view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) * (view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) * 
view(op.sol, 1:2*op.k)) + mul!(view(op.intermediary_vector, 1:2*op.k), view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k), view(op.sol, 1:2*op.k)) + mul!(view(op.sol, 1:2*op.k), view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k), view(op.intermediary_vector, 1:2*op.k)) # step 7 - Bv .= op.α .* v .- (op.Yₖ[:,1:op.k] * op.sol[1:op.k] .+ op.α .* op.Sₖ[:,1:op.k] * op.sol[op.k+1:2*op.k]) - + # Bv .= op.α .* v .- (view(op.Yₖ, :,1:op.k) * view(op.sol, 1:op.k) .+ op.α .* view(op.Sₖ, :, 1:op.k) * view(op.sol, op.k+1:2*op.k)) + + mul!(Bv, view(op.Yₖ, :, 1:op.k), view(op.sol, 1:op.k)) + mul!(Bv, view(op.Sₖ, :, 1:op.k), view(op.sol, op.k+1:2*op.k), - op.α, (T)(-1)) + Bv .+= op.α .* v return Bv end \ No newline at end of file From 5ca9000603e3ec7e1fea13f36b866eaeb2eba2c2 Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Mon, 16 Jan 2023 22:59:36 +0200 Subject: [PATCH 03/16] circular push! --- src/compressed_lbfgs.jl | 79 +++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index b8077008..00ab7db0 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -36,21 +36,21 @@ default_gpu() = CUDA.functional() ? true : false default_matrix_type(gpu::Bool, T::DataType) = gpu ? CuMatrix{T} : Matrix{T} default_vector_type(gpu::Bool, T::DataType) = gpu ? 
CuVector{T} : Vector{T} -function CompressedLBFGS(m::Int, n::Int; T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu,T), V=default_vector_type(gpu,T)) +function CompressedLBFGS(m::Int, n::Int; T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu, T), V=default_vector_type(gpu, T)) α = (T)(1) k = 0 - Sₖ = M(undef,n,m) - Yₖ = M(undef,n,m) - Dₖ = Diagonal(V(undef,m)) - Lₖ = LowerTriangular(M(undef,m,m)) - - chol_matrix = M(undef,m,m) - intermediate_1 = UpperTriangular(M(undef,2*m,2*m)) - intermediate_2 = LowerTriangular(M(undef,2*m,2*m)) - inverse_intermediate_1 = UpperTriangular(M(undef,2*m,2*m)) - inverse_intermediate_2 = LowerTriangular(M(undef,2*m,2*m)) - intermediary_vector = V(undef,2*m) - sol = V(undef,2*m) + Sₖ = M(undef, n, m) + Yₖ = M(undef, n, m) + Dₖ = Diagonal(V(undef, m)) + Lₖ = LowerTriangular(M(undef, m, m)) + + chol_matrix = M(undef, m, m) + intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m)) + intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m)) + inverse_intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m)) + inverse_intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m)) + intermediary_vector = V(undef, 2*m) + sol = V(undef, 2*m) intermediate_structure_updated = false return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol, intermediate_structure_updated) end @@ -58,45 +58,54 @@ end function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} if op.k < op.m # still some place in structures op.k += 1 - op.Sₖ[:,op.k] .= s - op.Yₖ[:,op.k] .= y - op.Dₖ.diag[op.k] = dot(s,y) + op.Sₖ[:, op.k] .= s + op.Yₖ[:, op.k] .= y + op.Dₖ.diag[op.k] = dot(s, y) op.Lₖ.data[op.k, op.k] = 0 for i in 1:op.k-1 - op.Lₖ.data[op.k, i] = dot(s,op.Yₖ[:,i]) + # op.Lₖ.data[op.k, i] = dot(s, op.Yₖ[:, i]) + op.Lₖ.data[op.k, i] = dot(op.Sₖ[:, op.k], op.Yₖ[:, i]) end # the secan equation fails if this line is uncommented - # 
op.α = dot(y,s)/dot(s,s) else # update matrix with circular shift + println("else") # must be tested - circshift(op.Sₖ, (0,-1)) - circshift(op.Yₖ, (0,-1)) - circshift(op.Dₖ, (-1,-1)) + op.Sₖ .= circshift(op.Sₖ, (0, -1)) + op.Yₖ .= circshift(op.Yₖ, (0, -1)) + op.Dₖ .= circshift(op.Dₖ, (-1, -1)) + op.Sₖ[:, op.k] .= s + op.Yₖ[:, op.k] .= y + op.Dₖ.diag[op.k] = dot(s, y) # circshift doesn't work for a LowerTriangular matrix # for the time being, reinstantiate completely the Lₖ matrix - for j in 2:op.k + for j in 1:op.k for i in 1:j-1 - op.Lₖ.data[j, i] = dot(op.Sₖ[:,j],op.Yₖ[:,i]) + op.Lₖ.data[j, i] = dot(op.Sₖ[:, j], op.Yₖ[:, i]) end end end + @show op.Lₖ + @show op.Sₖ + @show op.Yₖ + @show op.Dₖ + # op.α = dot(y,s)/dot(s,s) op.intermediate_structure_updated = false return op end # Theorem 2.3 (p6) function Base.Matrix(op::CompressedLBFGS{T,M,V}) where {T,M,V} - B₀ = M(zeros(T,op.n, op.n)) - map(i -> B₀[i,i] = op.α, 1:op.n) + B₀ = M(zeros(T, op.n, op.n)) + map(i -> B₀[i, i] = op.α, 1:op.n) BSY = M(undef, op.n, 2*op.k) - (op.k > 0) && (BSY[:,1:op.k] = B₀ * op.Sₖ[:,1:op.k]) - (op.k > 0) && (BSY[:,op.k+1:2*op.k] = op.Yₖ[:,1:op.k]) + (op.k > 0) && (BSY[:, 1:op.k] = B₀ * op.Sₖ[:, 1:op.k]) + (op.k > 0) && (BSY[:, op.k+1:2*op.k] = op.Yₖ[:, 1:op.k]) _C = M(undef, 2*op.k, 2*op.k) - (op.k > 0) && (_C[1:op.k, 1:op.k] .= transpose(op.Sₖ[:,1:op.k]) * op.Sₖ[:,1:op.k]) - (op.k > 0) && (_C[1:op.k, op.k+1:2*op.k] .= op.Lₖ[1:op.k,1:op.k]) - (op.k > 0) && (_C[op.k+1:2*op.k, 1:op.k] .= transpose(op.Lₖ[1:op.k,1:op.k])) - (op.k > 0) && (_C[op.k+1:2*op.k, op.k+1:2*op.k] .= .- op.Dₖ[1:op.k,1:op.k]) + (op.k > 0) && (_C[1:op.k, 1:op.k] .= transpose(op.Sₖ[:, 1:op.k]) * op.Sₖ[:, 1:op.k]) + (op.k > 0) && (_C[1:op.k, op.k+1:2*op.k] .= op.Lₖ[1:op.k, 1:op.k]) + (op.k > 0) && (_C[op.k+1:2*op.k, 1:op.k] .= transpose(op.Lₖ[1:op.k, 1:op.k])) + (op.k > 0) && (_C[op.k+1:2*op.k, op.k+1:2*op.k] .= .- op.Dₖ[1:op.k, 1:op.k]) C = inv(_C) Bₖ = B₀ .- BSY * C * transpose(BSY) @@ -106,7 +115,7 @@ end # step 4, Jₖ 
is computed only if needed function inverse_cholesky(op::CompressedLBFGS) view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(op.Dₖ[1:op.k, 1:op.k]) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) - cholesky!(view(op.chol_matrix,1:op.k,1:op.k)) + cholesky!(Symmetric(view(op.chol_matrix, 1:op.k, 1:op.k))) Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k))) return Jₖ end @@ -125,8 +134,8 @@ function precompile_iterated_structure!(op::CompressedLBFGS) view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) view(op.intermediate_2, op.k+1:2*op.k, op.k+1:2*op.k) .= Jₖ - view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[ 1:2*op.k,1:2*op.k]) - view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_2[ 1:2*op.k,1:2*op.k]) + view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[1:2*op.k, 1:2*op.k]) + view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_2[1:2*op.k, 1:2*op.k]) op.intermediate_structure_updated = true end @@ -140,7 +149,7 @@ function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M, # step 5, try views for mul! 
mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v) - mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :,1:op.k)), v) + mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :, 1:op.k)), v) # scal!(op.α, view(op.sol, op.k+1:2*op.k)) # more allocation, slower view(op.sol, op.k+1:2*op.k) .*= op.α From 3ff759bba039902bbdc5d14a0134c864ca4139cd Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Tue, 17 Jan 2023 00:03:24 +0200 Subject: [PATCH 04/16] add documentation --- src/compressed_lbfgs.jl | 55 ++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index 00ab7db0..fa313031 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -12,6 +12,33 @@ using CUDA export CompressedLBFGS +""" + CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} + +A LBFGS limited-memory operator. +It represents a linear application Rⁿˣⁿ, considering at most `m` BFGS updates. +This implementation considers the bloc matrices reoresentation of the BFGS (forward) update. +It follows the algorithm described in [REPRESENTATIONS OF QUASI-NEWTON MATRICES AND THEIR USE IN LIMITED MEMORY METHODS](https://link.springer.com/article/10.1007/BF01582063) from Richard H. Byrd, Jorge Nocedal and Robert B. Schnabel (1994). +This operator considers several fields directly related to the bloc representation of the operator: +- `m`: the maximal memory of the operator; +- `n`: the dimension of the linear application; +- `k`: the current memory's size of the operator; +- `α`: scalar for `B₀ = α I`; +- `Sₖ`: retain the `k`-th last vectors `s` from the updates parametrized by `(s,y)`; +- `Yₖ`: retain the `k`-th last vectors `y` from the updates parametrized by `(s,y)`;; +- `Dₖ`: a diagonal matrix mandatory to perform the linear application and to form the matrix; +- `Lₖ`: a lower diagonal mandatory to perform the linear application and to form the matrix. 
+In addition to this structures which are circurlarly update when `k` reaches `m`, we consider other intermediate data structures renew at each update: +- `chol_matrix`: a matrix required to store a Cholesky factorization of a Rᵏˣᵏ matrix; +- `intermediate_1`: a R²ᵏˣ²ᵏ matrix; +- `intermediate_2`: a R²ᵏˣ²ᵏ matrix; +- `inverse_intermediate_1`: a R²ᵏˣ²ᵏ matrix; +- `inverse_intermediate_2`: a R²ᵏˣ²ᵏ matrix; +- `intermediary_vector`: a vector ∈ Rᵏ to store intermediate solutions; +- `sol`: a vector ∈ Rᵏ to store intermediate solutions; +- `intermediate_structure_updated`: inform if the intermediate structures are up-to-date or not. +This implementation is designed to work either on CPU or GPU. +""" mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} m::Int # memory of the operator n::Int # vector size @@ -36,7 +63,13 @@ default_gpu() = CUDA.functional() ? true : false default_matrix_type(gpu::Bool, T::DataType) = gpu ? CuMatrix{T} : Matrix{T} default_vector_type(gpu::Bool, T::DataType) = gpu ? CuVector{T} : Vector{T} -function CompressedLBFGS(m::Int, n::Int; T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu, T), V=default_vector_type(gpu, T)) +""" + CompressedLBFGS(n::Int; [T=Float64, m=5], gpu:Bool) + +A implementation of a LBFGS operator (forward), representing a `nxn` linear application. +It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU. 
+""" +function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu, T), V=default_vector_type(gpu, T)) α = (T)(1) k = 0 Sₖ = M(undef, n, m) @@ -56,20 +89,16 @@ function CompressedLBFGS(m::Int, n::Int; T=Float64, gpu=default_gpu(), M=default end function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} - if op.k < op.m # still some place in structures + if op.k < op.m # still some place in the structures op.k += 1 op.Sₖ[:, op.k] .= s op.Yₖ[:, op.k] .= y op.Dₖ.diag[op.k] = dot(s, y) op.Lₖ.data[op.k, op.k] = 0 for i in 1:op.k-1 - # op.Lₖ.data[op.k, i] = dot(s, op.Yₖ[:, i]) op.Lₖ.data[op.k, i] = dot(op.Sₖ[:, op.k], op.Yₖ[:, i]) end - # the secan equation fails if this line is uncommented - else # update matrix with circular shift - println("else") - # must be tested + else # k == m update circurlarly the intermediary structures op.Sₖ .= circshift(op.Sₖ, (0, -1)) op.Yₖ .= circshift(op.Yₖ, (0, -1)) op.Dₖ .= circshift(op.Dₖ, (-1, -1)) @@ -84,15 +113,13 @@ function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:Abstra end end end - @show op.Lₖ - @show op.Sₖ - @show op.Yₖ - @show op.Dₖ + # secant equation fails if uncommented # op.α = dot(y,s)/dot(s,s) op.intermediate_structure_updated = false return op end +# Algorithm 3.2 (p15) # Theorem 2.3 (p6) function Base.Matrix(op::CompressedLBFGS{T,M,V}) where {T,M,V} B₀ = M(zeros(T, op.n, op.n)) @@ -112,6 +139,7 @@ function Base.Matrix(op::CompressedLBFGS{T,M,V}) where {T,M,V} return Bₖ end +# Algorithm 3.2 (p15) # step 4, Jₖ is computed only if needed function inverse_cholesky(op::CompressedLBFGS) view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(op.Dₖ[1:op.k, 1:op.k]) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) @@ -144,10 +172,10 @@ end function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} # step 1-3 
mainly done by Base.push! - # steps 4 and 6, in case the intermediary required structure are not up to date + # steps 4 and 6, in case the intermediary structures required are not up to date (!op.intermediate_structure_updated) && (precompile_iterated_structure!(op)) - # step 5, try views for mul! + # step 5 mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v) mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :, 1:op.k)), v) # scal!(op.α, view(op.sol, op.k+1:2*op.k)) # more allocation, slower @@ -159,7 +187,6 @@ function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M, # step 7 # Bv .= op.α .* v .- (view(op.Yₖ, :,1:op.k) * view(op.sol, 1:op.k) .+ op.α .* view(op.Sₖ, :, 1:op.k) * view(op.sol, op.k+1:2*op.k)) - mul!(Bv, view(op.Yₖ, :, 1:op.k), view(op.sol, 1:op.k)) mul!(Bv, view(op.Sₖ, :, 1:op.k), view(op.sol, op.k+1:2*op.k), - op.α, (T)(-1)) Bv .+= op.α .* v From 7d0ef3d136a092680e04925ac7c0b87e0453e96a Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Tue, 17 Jan 2023 16:39:27 +0200 Subject: [PATCH 05/16] add tests + change to satisfy the tests --- src/compressed_lbfgs.jl | 2 +- test/runtests.jl | 1 + test/test_clbfgs.jl | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 test/test_clbfgs.jl diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index fa313031..e17669cf 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -142,7 +142,7 @@ end # Algorithm 3.2 (p15) # step 4, Jₖ is computed only if needed function inverse_cholesky(op::CompressedLBFGS) - view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(op.Dₖ[1:op.k, 1:op.k]) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) + view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * 
transpose(view(op.Lₖ, 1:op.k, 1:op.k)) cholesky!(Symmetric(view(op.chol_matrix, 1:op.k, 1:op.k))) Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k))) return Jₖ diff --git a/test/runtests.jl b/test/runtests.jl index b617a0ca..9538c177 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,6 +7,7 @@ include("test_linop_allocs.jl") include("test_adjtrans.jl") include("test_cat.jl") include("test_lbfgs.jl") +include("test_clbfgs.jl") include("test_lsr1.jl") include("test_kron.jl") include("test_callable.jl") diff --git a/test/test_clbfgs.jl b/test/test_clbfgs.jl new file mode 100644 index 00000000..3f5f6de5 --- /dev/null +++ b/test/test_clbfgs.jl @@ -0,0 +1,17 @@ +@testset "CompressedLBFGS operator" begin + iter=50 + n=100 + n=5 + lbfgs = CompressedLBFGS(n) # m=5 + Bv = rand(n) + for i in 1:iter + s = rand(n) + y = rand(n) + push!(lbfgs, s, y) + # warmp-up computing the mandatory intermediate structures + mul!(Bv, lbfgs, s) + allocs = @allocated mul!(Bv, lbfgs, s) + @test allocs == 0 + @test Bv ≈ y + end +end From 28fdfc0e17631ed5437d013dac28f33c536d749c Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Wed, 18 Jan 2023 12:16:12 +0200 Subject: [PATCH 06/16] Reduce allocations in push! + modify tests --- src/compressed_lbfgs.jl | 81 +++++++++++++++++++++++++---------------- test/test_clbfgs.jl | 10 +++-- 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index e17669cf..55616949 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -36,7 +36,6 @@ In addition to this structures which are circurlarly update when `k` reaches `m` - `inverse_intermediate_2`: a R²ᵏˣ²ᵏ matrix; - `intermediary_vector`: a vector ∈ Rᵏ to store intermediate solutions; - `sol`: a vector ∈ Rᵏ to store intermediate solutions; -- `intermediate_structure_updated`: inform if the intermediate structures are up-to-date or not. This implementation is designed to work either on CPU or GPU. 
""" mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} @@ -50,18 +49,23 @@ mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} Lₖ::LowerTriangular{T,M} # m * m chol_matrix::M # 2m * 2m + intermediate_diagonal::Diagonal{T,V} # m * m intermediate_1::UpperTriangular{T,M} # 2m * 2m intermediate_2::LowerTriangular{T,M} # 2m * 2m inverse_intermediate_1::UpperTriangular{T,M} # 2m * 2m inverse_intermediate_2::LowerTriangular{T,M} # 2m * 2m intermediary_vector::V # 2m sol::V # m - intermediate_structure_updated::Bool end default_gpu() = CUDA.functional() ? true : false -default_matrix_type(gpu::Bool, T::DataType) = gpu ? CuMatrix{T} : Matrix{T} -default_vector_type(gpu::Bool, T::DataType) = gpu ? CuVector{T} : Vector{T} +default_matrix_type(gpu::Bool; T::DataType=Float64) = gpu ? CuMatrix{T} : Matrix{T} +default_vector_type(gpu::Bool; T::DataType=Float64) = gpu ? CuVector{T} : Vector{T} + +function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=size(A)[1]) where T + map(i-> view(A,:,i+direction) .= view(A,:,i), 1-direction:indicemax) + return A +end """ CompressedLBFGS(n::Int; [T=Float64, m=5], gpu:Bool) @@ -69,35 +73,34 @@ default_vector_type(gpu::Bool, T::DataType) = gpu ? CuVector{T} : Vector{T} A implementation of a LBFGS operator (forward), representing a `nxn` linear application. It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU. 
""" -function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu, T), V=default_vector_type(gpu, T)) +function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu; T), V=default_vector_type(gpu; T)) α = (T)(1) k = 0 Sₖ = M(undef, n, m) Yₖ = M(undef, n, m) Dₖ = Diagonal(V(undef, m)) Lₖ = LowerTriangular(M(undef, m, m)) + Lₖ .= (T)(0) chol_matrix = M(undef, m, m) + intermediate_diagonal = Diagonal(V(undef, m)) intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m)) intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m)) inverse_intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m)) inverse_intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m)) intermediary_vector = V(undef, 2*m) sol = V(undef, 2*m) - intermediate_structure_updated = false - return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol, intermediate_structure_updated) + return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol) end function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} if op.k < op.m # still some place in the structures op.k += 1 - op.Sₖ[:, op.k] .= s - op.Yₖ[:, op.k] .= y - op.Dₖ.diag[op.k] = dot(s, y) - op.Lₖ.data[op.k, op.k] = 0 - for i in 1:op.k-1 - op.Lₖ.data[op.k, i] = dot(op.Sₖ[:, op.k], op.Yₖ[:, i]) - end + view(op.Sₖ, :, op.k) .= s + view(op.Yₖ, :, op.k) .= y + view(op.Dₖ.diag, op.k) .= dot(s, y) + mul!(view(op.Lₖ.data, op.k, 1:op.k-1), transpose(view(op.Yₖ, :, 1:op.k-1)), view(op.Sₖ, :, op.k) ) + else # k == m update circurlarly the intermediary structures op.Sₖ .= circshift(op.Sₖ, (0, -1)) op.Yₖ .= circshift(op.Yₖ, (0, -1)) @@ -109,13 +112,16 @@ function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where 
{T,M,V<:Abstra # for the time being, reinstantiate completely the Lₖ matrix for j in 1:op.k for i in 1:j-1 - op.Lₖ.data[j, i] = dot(op.Sₖ[:, j], op.Yₖ[:, i]) + op.Lₖ.data[j, i] = dot(view(op.Sₖ,:, j), view(op.Yₖ, :, i)) end end end + + # step 4 and 6 + precompile_iterated_structure!(op) + # secant equation fails if uncommented # op.α = dot(y,s)/dot(s,s) - op.intermediate_structure_updated = false return op end @@ -141,8 +147,18 @@ end # Algorithm 3.2 (p15) # step 4, Jₖ is computed only if needed -function inverse_cholesky(op::CompressedLBFGS) - view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) +function inverse_cholesky(op::CompressedLBFGS{T,M,V}) where {T,M,V} + view(op.intermediate_diagonal.diag, 1:op.k) .= inv.(view(op.Dₖ.diag, 1:op.k)) + + # view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) + mul!(view(op.inverse_intermediate_1, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k))) + mul!(view(op.chol_matrix, 1:op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.inverse_intermediate_1, 1:op.k, 1:op.k)) + + # view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) + mul!(view(op.chol_matrix, 1:op.k, 1:op.k), transpose(view(op.Sₖ, :, 1:op.k)), view(op.Sₖ, :, 1:op.k), op.α, (T)(1)) + + # view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) + cholesky!(Symmetric(view(op.chol_matrix, 1:op.k, 1:op.k))) Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k))) return Jₖ @@ -152,29 +168,32 @@ end function precompile_iterated_structure!(op::CompressedLBFGS) Jₖ = 
inverse_cholesky(op) - view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.Dₖ, 1:op.k, 1:op.k)^(1/2) - view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) + # constant update view(op.intermediate_1, op.k+1:2*op.k, 1:op.k) .= 0 - view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ) - - view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(1/2) view(op.intermediate_2, 1:op.k, op.k+1:2*op.k) .= 0 - view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) + view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ) view(op.intermediate_2, op.k+1:2*op.k, op.k+1:2*op.k) .= Jₖ + # updates related to D^(1/2) + view(op.intermediate_diagonal.diag, 1:op.k) .= sqrt.(view(op.Dₖ.diag, 1:op.k)) + view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.intermediate_diagonal, 1:op.k, 1:op.k) + view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.intermediate_diagonal, 1:op.k, 1:op.k) + + # updates related to D^(-1/2) + view(op.intermediate_diagonal.diag, 1:op.k) .= (x -> 1/sqrt(x)).(view(op.Dₖ.diag, 1:op.k)) + mul!(view(op.intermediate_1, 1:op.k,op.k+1:2*op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k))) + # view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) + mul!(view(op.intermediate_2, op.k+1:2*op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k)) + view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .* -1 + # view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) + view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[1:2*op.k, 1:2*op.k]) view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= 
inv(op.intermediate_2[1:2*op.k, 1:2*op.k]) - - op.intermediate_structure_updated = true end # Algorithm 3.2 (p15) function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} - # step 1-3 mainly done by Base.push! - - # steps 4 and 6, in case the intermediary structures required are not up to date - (!op.intermediate_structure_updated) && (precompile_iterated_structure!(op)) - + # step 1-4 and 6 mainly done by Base.push! # step 5 mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v) mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :, 1:op.k)), v) diff --git a/test/test_clbfgs.jl b/test/test_clbfgs.jl index 3f5f6de5..dc929d7f 100644 --- a/test/test_clbfgs.jl +++ b/test/test_clbfgs.jl @@ -3,13 +3,15 @@ n=100 n=5 lbfgs = CompressedLBFGS(n) # m=5 - Bv = rand(n) + V = LinearOperators.default_vector_type(LinearOperators.default_gpu()) + Bv = V(rand(n)) + s = V(rand(n)) + mul!(Bv, lbfgs, s) # warm-up for i in 1:iter - s = rand(n) - y = rand(n) + s = V(rand(n)) + y = V(rand(n)) push!(lbfgs, s, y) # warmp-up computing the mandatory intermediate structures - mul!(Bv, lbfgs, s) allocs = @allocated mul!(Bv, lbfgs, s) @test allocs == 0 @test Bv ≈ y From 050d219b903cac305ab4415d7cadd9797343f870 Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Wed, 18 Jan 2023 15:48:50 +0200 Subject: [PATCH 07/16] improve circular shift in push! 
(faster update than LBFGSOperator) --- src/compressed_lbfgs.jl | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index 55616949..e0d7f940 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -67,6 +67,11 @@ function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=si return A end +function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=size(A)[1]) where T + map(i-> view(A,:,i+direction) .= view(A,:,i), 1-direction:indicemax) + return A +end + """ CompressedLBFGS(n::Int; [T=Float64, m=5], gpu:Bool) @@ -100,21 +105,16 @@ function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:Abstra view(op.Yₖ, :, op.k) .= y view(op.Dₖ.diag, op.k) .= dot(s, y) mul!(view(op.Lₖ.data, op.k, 1:op.k-1), transpose(view(op.Yₖ, :, 1:op.k-1)), view(op.Sₖ, :, op.k) ) - else # k == m update circurlarly the intermediary structures - op.Sₖ .= circshift(op.Sₖ, (0, -1)) - op.Yₖ .= circshift(op.Yₖ, (0, -1)) + columnshift!(op.Sₖ; indicemax=op.k) + columnshift!(op.Yₖ; indicemax=op.k) op.Dₖ .= circshift(op.Dₖ, (-1, -1)) - op.Sₖ[:, op.k] .= s - op.Yₖ[:, op.k] .= y - op.Dₖ.diag[op.k] = dot(s, y) - # circshift doesn't work for a LowerTriangular matrix - # for the time being, reinstantiate completely the Lₖ matrix - for j in 1:op.k - for i in 1:j-1 - op.Lₖ.data[j, i] = dot(view(op.Sₖ,:, j), view(op.Yₖ, :, i)) - end - end + view(op.Sₖ, :, op.k) .= s + view(op.Yₖ, :, op.k) .= y + view(op.Dₖ.diag, op.k) .= dot(s, y) + + map(i-> view(op.Lₖ, i:op.m-1, i-1) .= view(op.Lₖ, i+1:op.m, i), 2:op.m) + mul!(view(op.Lₖ.data, op.k, 1:op.k-1), transpose(view(op.Yₖ, :, 1:op.k-1)), view(op.Sₖ, :, op.k) ) end # step 4 and 6 @@ -150,15 +150,11 @@ end function inverse_cholesky(op::CompressedLBFGS{T,M,V}) where {T,M,V} view(op.intermediate_diagonal.diag, 1:op.k) .= inv.(view(op.Dₖ.diag, 1:op.k)) - # view(op.Lₖ, 1:op.k, 1:op.k) * 
inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) mul!(view(op.inverse_intermediate_1, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k))) mul!(view(op.chol_matrix, 1:op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.inverse_intermediate_1, 1:op.k, 1:op.k)) - # view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) mul!(view(op.chol_matrix, 1:op.k, 1:op.k), transpose(view(op.Sₖ, :, 1:op.k)), view(op.Sₖ, :, 1:op.k), op.α, (T)(1)) - # view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) - cholesky!(Symmetric(view(op.chol_matrix, 1:op.k, 1:op.k))) Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k))) return Jₖ @@ -182,10 +178,8 @@ function precompile_iterated_structure!(op::CompressedLBFGS) # updates related to D^(-1/2) view(op.intermediate_diagonal.diag, 1:op.k) .= (x -> 1/sqrt(x)).(view(op.Dₖ.diag, 1:op.k)) mul!(view(op.intermediate_1, 1:op.k,op.k+1:2*op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k))) - # view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k)) mul!(view(op.intermediate_2, op.k+1:2*op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k)) view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .* -1 - # view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[1:2*op.k, 1:2*op.k]) view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_2[1:2*op.k, 1:2*op.k]) @@ -200,12 +194,10 @@ function 
LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M, # scal!(op.α, view(op.sol, op.k+1:2*op.k)) # more allocation, slower view(op.sol, op.k+1:2*op.k) .*= op.α - # view(op.sol, 1:2*op.k) .= view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) * (view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) * view(op.sol, 1:2*op.k)) mul!(view(op.intermediary_vector, 1:2*op.k), view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k), view(op.sol, 1:2*op.k)) mul!(view(op.sol, 1:2*op.k), view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k), view(op.intermediary_vector, 1:2*op.k)) # step 7 - # Bv .= op.α .* v .- (view(op.Yₖ, :,1:op.k) * view(op.sol, 1:op.k) .+ op.α .* view(op.Sₖ, :, 1:op.k) * view(op.sol, op.k+1:2*op.k)) mul!(Bv, view(op.Yₖ, :, 1:op.k), view(op.sol, 1:op.k)) mul!(Bv, view(op.Sₖ, :, 1:op.k), view(op.sol, op.k+1:2*op.k), - op.α, (T)(-1)) Bv .+= op.α .* v From f0884c84c29cb4c283c198854067e8ee2ce8014a Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Wed, 18 Jan 2023 16:07:59 +0200 Subject: [PATCH 08/16] m -> mem, CompressedLBFGS -> CompressedLBFGSOperator + add vectorshift! --- src/compressed_lbfgs.jl | 79 +++++++++++++++++++++-------------------- test/test_clbfgs.jl | 5 ++- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index e0d7f940..5407ebd4 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -10,17 +10,17 @@ Implemented by Paul Raynaud (supervised by Dominique Orban) using LinearAlgebra, LinearAlgebra.BLAS using CUDA -export CompressedLBFGS +export CompressedLBFGSOperator """ - CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} + CompressedLBFGSOperator{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} A LBFGS limited-memory operator. -It represents a linear application Rⁿˣⁿ, considering at most `m` BFGS updates. +It represents a linear application Rⁿˣⁿ, considering at most `mem` BFGS updates. 
This implementation considers the bloc matrices reoresentation of the BFGS (forward) update. It follows the algorithm described in [REPRESENTATIONS OF QUASI-NEWTON MATRICES AND THEIR USE IN LIMITED MEMORY METHODS](https://link.springer.com/article/10.1007/BF01582063) from Richard H. Byrd, Jorge Nocedal and Robert B. Schnabel (1994). This operator considers several fields directly related to the bloc representation of the operator: -- `m`: the maximal memory of the operator; +- `mem`: the maximal memory of the operator; - `n`: the dimension of the linear application; - `k`: the current memory's size of the operator; - `α`: scalar for `B₀ = α I`; @@ -28,7 +28,7 @@ This operator considers several fields directly related to the bloc representati - `Yₖ`: retain the `k`-th last vectors `y` from the updates parametrized by `(s,y)`;; - `Dₖ`: a diagonal matrix mandatory to perform the linear application and to form the matrix; - `Lₖ`: a lower diagonal mandatory to perform the linear application and to form the matrix. -In addition to this structures which are circurlarly update when `k` reaches `m`, we consider other intermediate data structures renew at each update: +In addition to this structures which are circurlarly update when `k` reaches `mem`, we consider other intermediate data structures renew at each update: - `chol_matrix`: a matrix required to store a Cholesky factorization of a Rᵏˣᵏ matrix; - `intermediate_1`: a R²ᵏˣ²ᵏ matrix; - `intermediate_2`: a R²ᵏˣ²ᵏ matrix; @@ -38,24 +38,24 @@ In addition to this structures which are circurlarly update when `k` reaches `m` - `sol`: a vector ∈ Rᵏ to store intermediate solutions; This implementation is designed to work either on CPU or GPU. 
""" -mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} - m::Int # memory of the operator +mutable struct CompressedLBFGSOperator{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} + mem::Int # memory of the operator n::Int # vector size - k::Int # k ≤ m, active memory of the operator + k::Int # k ≤ mem, active memory of the operator α::T # B₀ = αI Sₖ::M # gather all sₖ₋ₘ Yₖ::M # gather all yₖ₋ₘ - Dₖ::Diagonal{T,V} # m * m - Lₖ::LowerTriangular{T,M} # m * m + Dₖ::Diagonal{T,V} # mem * mem + Lₖ::LowerTriangular{T,M} # mem * mem chol_matrix::M # 2m * 2m - intermediate_diagonal::Diagonal{T,V} # m * m + intermediate_diagonal::Diagonal{T,V} # mem * mem intermediate_1::UpperTriangular{T,M} # 2m * 2m intermediate_2::LowerTriangular{T,M} # 2m * 2m inverse_intermediate_1::UpperTriangular{T,M} # 2m * 2m inverse_intermediate_2::LowerTriangular{T,M} # 2m * 2m intermediary_vector::V # 2m - sol::V # m + sol::V # mem end default_gpu() = CUDA.functional() ? true : false @@ -67,53 +67,54 @@ function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=si return A end -function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=size(A)[1]) where T - map(i-> view(A,:,i+direction) .= view(A,:,i), 1-direction:indicemax) - return A +function vectorshift!(v::AbstractVector{T}; direction::Int=-1, indicemax::Int=length(v)) where T + view(v, 1:indicemax+direction) .= view(v,1-direction:indicemax) + return v end """ - CompressedLBFGS(n::Int; [T=Float64, m=5], gpu:Bool) + CompressedLBFGSOperator(n::Int; [T=Float64, mem=5], gpu:Bool) A implementation of a LBFGS operator (forward), representing a `nxn` linear application. It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU. 
""" -function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu; T), V=default_vector_type(gpu; T)) +function CompressedLBFGSOperator(n::Int; mem::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu; T), V=default_vector_type(gpu; T)) α = (T)(1) k = 0 - Sₖ = M(undef, n, m) - Yₖ = M(undef, n, m) - Dₖ = Diagonal(V(undef, m)) - Lₖ = LowerTriangular(M(undef, m, m)) + Sₖ = M(undef, n, mem) + Yₖ = M(undef, n, mem) + Dₖ = Diagonal(V(undef, mem)) + Lₖ = LowerTriangular(M(undef, mem, mem)) Lₖ .= (T)(0) - chol_matrix = M(undef, m, m) - intermediate_diagonal = Diagonal(V(undef, m)) - intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m)) - intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m)) - inverse_intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m)) - inverse_intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m)) - intermediary_vector = V(undef, 2*m) - sol = V(undef, 2*m) - return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol) + chol_matrix = M(undef, mem, mem) + intermediate_diagonal = Diagonal(V(undef, mem)) + intermediate_1 = UpperTriangular(M(undef, 2*mem, 2*mem)) + intermediate_2 = LowerTriangular(M(undef, 2*mem, 2*mem)) + inverse_intermediate_1 = UpperTriangular(M(undef, 2*mem, 2*mem)) + inverse_intermediate_2 = LowerTriangular(M(undef, 2*mem, 2*mem)) + intermediary_vector = V(undef, 2*mem) + sol = V(undef, 2*mem) + return CompressedLBFGSOperator{T,M,V}(mem, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol) end -function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} - if op.k < op.m # still some place in the structures +function Base.push!(op::CompressedLBFGSOperator{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} + if 
op.k < op.mem # still some place in the structures op.k += 1 view(op.Sₖ, :, op.k) .= s view(op.Yₖ, :, op.k) .= y view(op.Dₖ.diag, op.k) .= dot(s, y) mul!(view(op.Lₖ.data, op.k, 1:op.k-1), transpose(view(op.Yₖ, :, 1:op.k-1)), view(op.Sₖ, :, op.k) ) - else # k == m update circurlarly the intermediary structures + else # k == mem update circurlarly the intermediary structures columnshift!(op.Sₖ; indicemax=op.k) columnshift!(op.Yₖ; indicemax=op.k) - op.Dₖ .= circshift(op.Dₖ, (-1, -1)) + # op.Dₖ .= circshift(op.Dₖ, (-1, -1)) + vectorshift!(op.Dₖ.diag; indicemax=op.k) view(op.Sₖ, :, op.k) .= s view(op.Yₖ, :, op.k) .= y view(op.Dₖ.diag, op.k) .= dot(s, y) - map(i-> view(op.Lₖ, i:op.m-1, i-1) .= view(op.Lₖ, i+1:op.m, i), 2:op.m) + map(i-> view(op.Lₖ, i:op.mem-1, i-1) .= view(op.Lₖ, i+1:op.mem, i), 2:op.mem) mul!(view(op.Lₖ.data, op.k, 1:op.k-1), transpose(view(op.Yₖ, :, 1:op.k-1)), view(op.Sₖ, :, op.k) ) end @@ -127,7 +128,7 @@ end # Algorithm 3.2 (p15) # Theorem 2.3 (p6) -function Base.Matrix(op::CompressedLBFGS{T,M,V}) where {T,M,V} +function Base.Matrix(op::CompressedLBFGSOperator{T,M,V}) where {T,M,V} B₀ = M(zeros(T, op.n, op.n)) map(i -> B₀[i, i] = op.α, 1:op.n) @@ -147,7 +148,7 @@ end # Algorithm 3.2 (p15) # step 4, Jₖ is computed only if needed -function inverse_cholesky(op::CompressedLBFGS{T,M,V}) where {T,M,V} +function inverse_cholesky(op::CompressedLBFGSOperator{T,M,V}) where {T,M,V} view(op.intermediate_diagonal.diag, 1:op.k) .= inv.(view(op.Dₖ.diag, 1:op.k)) mul!(view(op.inverse_intermediate_1, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k))) @@ -161,7 +162,7 @@ function inverse_cholesky(op::CompressedLBFGS{T,M,V}) where {T,M,V} end # step 6, must be improve -function precompile_iterated_structure!(op::CompressedLBFGS) +function precompile_iterated_structure!(op::CompressedLBFGSOperator) Jₖ = inverse_cholesky(op) # constant update @@ -186,7 +187,7 @@ function 
precompile_iterated_structure!(op::CompressedLBFGS) end # Algorithm 3.2 (p15) -function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} +function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGSOperator{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} # step 1-4 and 6 mainly done by Base.push! # step 5 mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v) diff --git a/test/test_clbfgs.jl b/test/test_clbfgs.jl index dc929d7f..265ff5a0 100644 --- a/test/test_clbfgs.jl +++ b/test/test_clbfgs.jl @@ -1,8 +1,8 @@ -@testset "CompressedLBFGS operator" begin +@testset "CompressedLBFGSOperator operator" begin iter=50 n=100 n=5 - lbfgs = CompressedLBFGS(n) # m=5 + lbfgs = CompressedLBFGSOperator(n) # m=5 V = LinearOperators.default_vector_type(LinearOperators.default_gpu()) Bv = V(rand(n)) s = V(rand(n)) @@ -11,7 +11,6 @@ s = V(rand(n)) y = V(rand(n)) push!(lbfgs, s, y) - # warmp-up computing the mandatory intermediate structures allocs = @allocated mul!(Bv, lbfgs, s) @test allocs == 0 @test Bv ≈ y From 4a293ccce6ab2ee8ba8b0b9bb69cf18659cbca7c Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Tue, 31 Jan 2023 11:10:54 +0200 Subject: [PATCH 09/16] new module ModCompressedLBFGSOperator + remove CUDA + add Requires + fix doc --- Project.toml | 4 ++-- docs/make.jl | 1 + docs/src/reference.md | 2 +- src/compressed_lbfgs.jl | 29 ++++++++++++++++++++++------- test/gpu/nvidia.jl | 2 ++ test/test_clbfgs.jl | 2 +- 6 files changed, 29 insertions(+), 11 deletions(-) diff --git a/Project.toml b/Project.toml index 1fb13163..6eb53e09 100644 --- a/Project.toml +++ b/Project.toml @@ -3,18 +3,18 @@ uuid = "5c8ed15e-5a4c-59e4-a42b-c7e8811fb125" version = "2.5.1" [deps] -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a" LDLFactorizations = "40e66cde-538c-5869-a4ad-c39174c6795b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" 
+Requires = "ae029012-a4dd-5104-9daa-d747884805df" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [compat] -CUDA = "3.12.1" FastClosures = "0.2, 0.3" LDLFactorizations = "0.9, 0.10" +Requires = "1.3" TimerOutputs = "^0.5" julia = "^1.6.0" diff --git a/docs/make.jl b/docs/make.jl index ff864bf9..5be8d95c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,5 @@ using Documenter, LinearOperators +using LinearOperators.ModCompressedLBFGSOperator makedocs( modules = [LinearOperators], diff --git a/docs/src/reference.md b/docs/src/reference.md index 7f04d9e7..99005987 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -13,5 +13,5 @@ Pages = ["reference.md"] ``` ```@autodocs -Modules = [LinearOperators] +Modules = [LinearOperators, ModCompressedLBFGSOperator] ``` diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index 5407ebd4..32417e65 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -1,3 +1,4 @@ +module ModCompressedLBFGSOperator #= Compressed LBFGS implementation from: REPRESENTATIONS OF QUASI-NEWTON MATRICES AND THEIR USE IN LIMITED MEMORY METHODS @@ -8,9 +9,21 @@ Implemented by Paul Raynaud (supervised by Dominique Orban) =# using LinearAlgebra, LinearAlgebra.BLAS -using CUDA +using Requires + +default_matrix_type(; T::DataType=Float64) = Matrix{T} +default_vector_type(; T::DataType=Float64) = Vector{T} + +@init begin + @require CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" begin + default_matrix_type(; T::DataType=Float64) = CUDA.CuMatrix{T} + default_vector_type(; T::DataType=Float64) = CUDA.CuVector{T} + end + # this scheme may be extended to other GPU modules +end export CompressedLBFGSOperator +export default_matrix_type, default_vector_type """ CompressedLBFGSOperator{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} @@ -58,10 +71,6 @@ mutable struct CompressedLBFGSOperator{T, M<:AbstractMatrix{T}, V<:AbstractVecto sol::V # mem end -default_gpu() = 
CUDA.functional() ? true : false -default_matrix_type(gpu::Bool; T::DataType=Float64) = gpu ? CuMatrix{T} : Matrix{T} -default_vector_type(gpu::Bool; T::DataType=Float64) = gpu ? CuVector{T} : Vector{T} - function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=size(A)[1]) where T map(i-> view(A,:,i+direction) .= view(A,:,i), 1-direction:indicemax) return A @@ -78,7 +87,7 @@ end A implementation of a LBFGS operator (forward), representing a `nxn` linear application. It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU. """ -function CompressedLBFGSOperator(n::Int; mem::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu; T), V=default_vector_type(gpu; T)) +function CompressedLBFGSOperator(n::Int; mem::Int=5, T=Float64, M=default_matrix_type(; T), V=default_vector_type(; T)) α = (T)(1) k = 0 Sₖ = M(undef, n, mem) @@ -203,4 +212,10 @@ function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGSOperator{T,M,V}, v::V) whe mul!(Bv, view(op.Sₖ, :, 1:op.k), view(op.sol, op.k+1:2*op.k), - op.α, (T)(-1)) Bv .+= op.α .* v return Bv -end \ No newline at end of file +end + +end + +using ..ModCompressedLBFGSOperator +export CompressedLBFGSOperator +export default_matrix_type, default_vector_type \ No newline at end of file diff --git a/test/gpu/nvidia.jl b/test/gpu/nvidia.jl index bd419924..e9c74c92 100644 --- a/test/gpu/nvidia.jl +++ b/test/gpu/nvidia.jl @@ -14,3 +14,5 @@ using LinearOperators, CUDA, CUDA.CUSPARSE, CUDA.CUSOLVER y = M * v @test y isa CuVector{Float32} end + +include("../test_clbfgs.jl") \ No newline at end of file diff --git a/test/test_clbfgs.jl b/test/test_clbfgs.jl index 265ff5a0..f22a7c5e 100644 --- a/test/test_clbfgs.jl +++ b/test/test_clbfgs.jl @@ -3,7 +3,7 @@ n=100 n=5 lbfgs = CompressedLBFGSOperator(n) # m=5 - V = LinearOperators.default_vector_type(LinearOperators.default_gpu()) + V = LinearOperators.default_vector_type() Bv = V(rand(n)) s = V(rand(n)) mul!(Bv, lbfgs, 
s) # warm-up From f447c62b5523cf462bac5ef99dadc30e64991263 Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Tue, 31 Jan 2023 14:04:02 +0200 Subject: [PATCH 10/16] test using CUDA#master --- .buildkite/pipeline.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 814bf484..9a0d19d3 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -9,7 +9,8 @@ steps: command: | julia --color=yes --project -e ' using Pkg - Pkg.add("CUDA") + # Pkg.add("CUDA") + Pkg.add(url="https://github.com/JuliaGPU/CUDA.jl", rev="master") Pkg.instantiate() include("test/gpu/nvidia.jl")' timeout_in_minutes: 30 From cf114e029e5f921d3e02ff639a398d51af8af971 Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Tue, 31 Jan 2023 15:27:20 +0200 Subject: [PATCH 11/16] fix CUDA types + adapt tests --- src/compressed_lbfgs.jl | 6 +++--- test/test_clbfgs.jl | 29 ++++++++++++++++------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index 32417e65..0ba84ed1 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -16,8 +16,8 @@ default_vector_type(; T::DataType=Float64) = Vector{T} @init begin @require CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" begin - default_matrix_type(; T::DataType=Float64) = CUDA.CuMatrix{T} - default_vector_type(; T::DataType=Float64) = CUDA.CuVector{T} + default_matrix_type(; T::DataType=Float64) = CUDA.CuMatrix{T, CUDA.Mem.DeviceBuffer} + default_vector_type(; T::DataType=Float64) = CUDA.CuVector{T, CUDA.Mem.DeviceBuffer} end # this scheme may be extended to other GPU modules end @@ -94,7 +94,7 @@ function CompressedLBFGSOperator(n::Int; mem::Int=5, T=Float64, M=default_matrix Yₖ = M(undef, n, mem) Dₖ = Diagonal(V(undef, mem)) Lₖ = LowerTriangular(M(undef, mem, mem)) - Lₖ .= (T)(0) + Lₖ.data .= (T)(0) chol_matrix = M(undef, mem, mem) intermediate_diagonal = Diagonal(V(undef, mem)) 
diff --git a/test/test_clbfgs.jl b/test/test_clbfgs.jl index f22a7c5e..4c4e6539 100644 --- a/test/test_clbfgs.jl +++ b/test/test_clbfgs.jl @@ -2,17 +2,20 @@ iter=50 n=100 n=5 - lbfgs = CompressedLBFGSOperator(n) # m=5 - V = LinearOperators.default_vector_type() - Bv = V(rand(n)) - s = V(rand(n)) - mul!(Bv, lbfgs, s) # warm-up - for i in 1:iter - s = V(rand(n)) - y = V(rand(n)) - push!(lbfgs, s, y) - allocs = @allocated mul!(Bv, lbfgs, s) - @test allocs == 0 - @test Bv ≈ y - end + types = [Float32, Float64] + for T in types + lbfgs = CompressedLBFGSOperator(n; T) # mem=5 + V = LinearOperators.default_vector_type(;T) + Bv = V(rand(T, n)) + s = V(rand(T, n)) + mul!(Bv, lbfgs, s) # warm-up + for i in 1:iter + s = V(rand(T, n)) + y = V(rand(T, n)) + push!(lbfgs, s, y) + allocs = @allocated mul!(Bv, lbfgs, s) + @test allocs == 0 + @test Bv ≈ y + end + end end From d80a7d8a474859cd40df6d8e6417175db28e6e81 Mon Sep 17 00:00:00 2001 From: Paul Raynaud Date: Wed, 1 Feb 2023 00:25:13 +0200 Subject: [PATCH 12/16] Update src/compressed_lbfgs.jl Co-authored-by: Alexis Montoison <35051714+amontoison@users.noreply.github.com> --- src/compressed_lbfgs.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index 0ba84ed1..e7122ed6 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -94,7 +94,7 @@ function CompressedLBFGSOperator(n::Int; mem::Int=5, T=Float64, M=default_matrix Yₖ = M(undef, n, mem) Dₖ = Diagonal(V(undef, mem)) Lₖ = LowerTriangular(M(undef, mem, mem)) - Lₖ.data .= (T)(0) + Lₖ.data .= zero(T) chol_matrix = M(undef, mem, mem) intermediate_diagonal = Diagonal(V(undef, mem)) From 11e3b34866120fa349c0c62a418e658b2c144c27 Mon Sep 17 00:00:00 2001 From: Paul Raynaud Date: Wed, 1 Feb 2023 00:25:24 +0200 Subject: [PATCH 13/16] Update src/compressed_lbfgs.jl Co-authored-by: Alexis Montoison <35051714+amontoison@users.noreply.github.com> --- src/compressed_lbfgs.jl | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index e7122ed6..f1ea137c 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -138,7 +138,7 @@ end # Algorithm 3.2 (p15) # Theorem 2.3 (p6) function Base.Matrix(op::CompressedLBFGSOperator{T,M,V}) where {T,M,V} - B₀ = M(zeros(T, op.n, op.n)) + B₀ = M(undef, op.n, op.n) map(i -> B₀[i, i] = op.α, 1:op.n) BSY = M(undef, op.n, 2*op.k) From 341332ff2caa13290c1f877eee2d0be7d63513c0 Mon Sep 17 00:00:00 2001 From: Paul Raynaud Date: Wed, 1 Feb 2023 00:25:34 +0200 Subject: [PATCH 14/16] Update src/compressed_lbfgs.jl Co-authored-by: Alexis Montoison <35051714+amontoison@users.noreply.github.com> --- src/compressed_lbfgs.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index f1ea137c..d4e844b6 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -148,7 +148,7 @@ function Base.Matrix(op::CompressedLBFGSOperator{T,M,V}) where {T,M,V} (op.k > 0) && (_C[1:op.k, 1:op.k] .= transpose(op.Sₖ[:, 1:op.k]) * op.Sₖ[:, 1:op.k]) (op.k > 0) && (_C[1:op.k, op.k+1:2*op.k] .= op.Lₖ[1:op.k, 1:op.k]) (op.k > 0) && (_C[op.k+1:2*op.k, 1:op.k] .= transpose(op.Lₖ[1:op.k, 1:op.k])) - (op.k > 0) && (_C[op.k+1:2*op.k, op.k+1:2*op.k] .= .- op.Dₖ[1:op.k, 1:op.k]) + (op.k > 0) && (_C[op.k+1:2*op.k, op.k+1:2*op.k] .-= op.Dₖ[1:op.k, 1:op.k]) C = inv(_C) Bₖ = B₀ .- BSY * C * transpose(BSY) From b82c99e8dcca06d730b9d9a2b1a4b8b4bb20dd48 Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Wed, 1 Feb 2023 00:04:20 +0200 Subject: [PATCH 15/16] apply correction from the review --- docs/make.jl | 1 - docs/src/reference.md | 2 +- src/compressed_lbfgs.jl | 9 +-------- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 5be8d95c..ff864bf9 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,5 +1,4 @@ using Documenter, LinearOperators -using 
LinearOperators.ModCompressedLBFGSOperator makedocs( modules = [LinearOperators], diff --git a/docs/src/reference.md b/docs/src/reference.md index 99005987..7f04d9e7 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -13,5 +13,5 @@ Pages = ["reference.md"] ``` ```@autodocs -Modules = [LinearOperators, ModCompressedLBFGSOperator] +Modules = [LinearOperators] ``` diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index d4e844b6..2743da52 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -1,4 +1,3 @@ -module ModCompressedLBFGSOperator #= Compressed LBFGS implementation from: REPRESENTATIONS OF QUASI-NEWTON MATRICES AND THEIR USE IN LIMITED MEMORY METHODS @@ -212,10 +211,4 @@ function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGSOperator{T,M,V}, v::V) whe mul!(Bv, view(op.Sₖ, :, 1:op.k), view(op.sol, op.k+1:2*op.k), - op.α, (T)(-1)) Bv .+= op.α .* v return Bv -end - -end - -using ..ModCompressedLBFGSOperator -export CompressedLBFGSOperator -export default_matrix_type, default_vector_type \ No newline at end of file +end \ No newline at end of file From b211db5d1c5ff8bca70d3eaa004e53d8069db26e Mon Sep 17 00:00:00 2001 From: "RAYNAUD Paul (raynaudp)" Date: Wed, 1 Feb 2023 12:14:18 +0200 Subject: [PATCH 16/16] restore buildkite + add CompressedLBFGSData + prod!
--- .buildkite/pipeline.yml | 3 +- src/compressed_lbfgs.jl | 244 ++++++++++++++++++++++++---------------- test/runtests.jl | 24 ++-- 3 files changed, 163 insertions(+), 108 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 9a0d19d3..814bf484 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -9,8 +9,7 @@ steps: command: | julia --color=yes --project -e ' using Pkg - # Pkg.add("CUDA") - Pkg.add(url="https://github.com/JuliaGPU/CUDA.jl", rev="master") + Pkg.add("CUDA") Pkg.instantiate() include("test/gpu/nvidia.jl")' timeout_in_minutes: 30 diff --git a/src/compressed_lbfgs.jl b/src/compressed_lbfgs.jl index 2743da52..36f4e27d 100644 --- a/src/compressed_lbfgs.jl +++ b/src/compressed_lbfgs.jl @@ -10,22 +10,32 @@ Implemented by Paul Raynaud (supervised by Dominique Orban) using LinearAlgebra, LinearAlgebra.BLAS using Requires +export CompressedLBFGSOperator, CompressedLBFGSData +# export default_matrix_type, default_vector_type + default_matrix_type(; T::DataType=Float64) = Matrix{T} default_vector_type(; T::DataType=Float64) = Vector{T} @init begin @require CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" begin - default_matrix_type(; T::DataType=Float64) = CUDA.CuMatrix{T, CUDA.Mem.DeviceBuffer} - default_vector_type(; T::DataType=Float64) = CUDA.CuVector{T, CUDA.Mem.DeviceBuffer} + default_matrix_type(; T::DataType=Float64) = CUDA.functional() ? CUDA.CuMatrix{T, CUDA.Mem.DeviceBuffer} : Matrix{T} + default_vector_type(; T::DataType=Float64) = CUDA.functional() ? 
CUDA.CuVector{T, CUDA.Mem.DeviceBuffer} : Vector{T} end - # this scheme may be extended to other GPU modules + # this scheme may be extended to other GPU backend modules +end + +function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=size(A)[1]) where T + map(i-> view(A,:,i+direction) .= view(A,:,i), 1-direction:indicemax) + return A end -export CompressedLBFGSOperator -export default_matrix_type, default_vector_type +function vectorshift!(v::AbstractVector{T}; direction::Int=-1, indicemax::Int=length(v)) where T + view(v, 1:indicemax+direction) .= view(v,1-direction:indicemax) + return v +end """ - CompressedLBFGSOperator{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} + CompressedLBFGSData{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} A LBFGS limited-memory operator. It represents a linear application Rⁿˣⁿ, considering at most `mem` BFGS updates. @@ -50,43 +60,35 @@ In addition to this structures which are circurlarly update when `k` reaches `me - `sol`: a vector ∈ Rᵏ to store intermediate solutions; This implementation is designed to work either on CPU or GPU. 
""" -mutable struct CompressedLBFGSOperator{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}} +mutable struct CompressedLBFGSData{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}, I <: Integer} mem::Int # memory of the operator - n::Int # vector size - k::Int # k ≤ mem, active memory of the operator + n::I # vector size + k::I # k ≤ mem, active memory of the operator α::T # B₀ = αI - Sₖ::M # gather all sₖ₋ₘ - Yₖ::M # gather all yₖ₋ₘ + Sₖ::M # gather all sₖ₋ₘ : n * mem + Yₖ::M # gather all yₖ₋ₘ : n * mem Dₖ::Diagonal{T,V} # mem * mem Lₖ::LowerTriangular{T,M} # mem * mem - chol_matrix::M # 2m * 2m + chol_matrix::M # 2mem * 2mem intermediate_diagonal::Diagonal{T,V} # mem * mem - intermediate_1::UpperTriangular{T,M} # 2m * 2m - intermediate_2::LowerTriangular{T,M} # 2m * 2m - inverse_intermediate_1::UpperTriangular{T,M} # 2m * 2m - inverse_intermediate_2::LowerTriangular{T,M} # 2m * 2m - intermediary_vector::V # 2m - sol::V # mem -end + intermediate_1::UpperTriangular{T,M} # 2mem * 2mem + intermediate_2::LowerTriangular{T,M} # 2mem * 2mem + inverse_intermediate_1::UpperTriangular{T,M} # 2mem * 2mem + inverse_intermediate_2::LowerTriangular{T,M} # 2mem * 2mem + intermediary_vector::V # 2mem + sol::V # 2mem -function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=size(A)[1]) where T - map(i-> view(A,:,i+direction) .= view(A,:,i), 1-direction:indicemax) - return A -end - -function vectorshift!(v::AbstractVector{T}; direction::Int=-1, indicemax::Int=length(v)) where T - view(v, 1:indicemax+direction) .= view(v,1-direction:indicemax) - return v + nprod::I end """ - CompressedLBFGSOperator(n::Int; [T=Float64, mem=5], gpu:Bool) + CompressedLBFGSData(n::Int; [T=Float64, mem=5], gpu:Bool) A implementation of a LBFGS operator (forward), representing a `nxn` linear application. It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU. 
""" -function CompressedLBFGSOperator(n::Int; mem::Int=5, T=Float64, M=default_matrix_type(; T), V=default_vector_type(; T)) +function CompressedLBFGSData(n::I; mem::I=5, T=Float64, M=default_matrix_type(; T), V=default_vector_type(; T)) where {I<:Integer} α = (T)(1) k = 0 Sₖ = M(undef, n, mem) @@ -103,51 +105,93 @@ function CompressedLBFGSOperator(n::Int; mem::Int=5, T=Float64, M=default_matrix inverse_intermediate_2 = LowerTriangular(M(undef, 2*mem, 2*mem)) intermediary_vector = V(undef, 2*mem) sol = V(undef, 2*mem) - return CompressedLBFGSOperator{T,M,V}(mem, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol) + + nprod = 0 + + return CompressedLBFGSData{T,M,V,I}(mem, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol, nprod) +end + +mutable struct CompressedLBFGSOperator{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}, F, I <: Integer} <: AbstractQuasiNewtonOperator{T} + nrow::I + ncol::I + symmetric::Bool + hermitian::Bool + Bv::V + data::CompressedLBFGSData{T,M,V} + prod!::F # apply the operator to a vector + tprod!::F # apply the transpose operator to a vector + ctprod!::F # apply the transpose conjugate operator to a vector +end + +function CompressedLBFGSOperator(n::I; mem::I=5, T=Float64, M=default_matrix_type(; T), V=default_vector_type(; T)) where {I <: Integer} + nrow = n + ncol = n + symmetric = true + hermitian = true + Bv = V(undef, n) + data = CompressedLBFGSData(n; mem, T, M, V) + + prod! = @closure (res, v, α, β) -> begin + mul!(Bv, data, v) + if β == zero(T) + res .= α .* Bv + else + res .= α .* Bv .+ β .* res + end + end + + F = typeof(prod!) + + return CompressedLBFGSOperator{T,M,V,F,I}(nrow, ncol, symmetric, hermitian, Bv, data, prod!, prod!, prod!) 
end -function Base.push!(op::CompressedLBFGSOperator{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} - if op.k < op.mem # still some place in the structures - op.k += 1 - view(op.Sₖ, :, op.k) .= s - view(op.Yₖ, :, op.k) .= y - view(op.Dₖ.diag, op.k) .= dot(s, y) - mul!(view(op.Lₖ.data, op.k, 1:op.k-1), transpose(view(op.Yₖ, :, 1:op.k-1)), view(op.Sₖ, :, op.k) ) +has_args5(op::CompressedLBFGSOperator) = true +use_prod5!(op::CompressedLBFGSOperator) = true + +Base.push!(op::CompressedLBFGSOperator{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} = Base.push!(op.data, s, y) +function Base.push!(data::CompressedLBFGSData{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}} + if data.k < data.mem # still some place in the structures + data.k += 1 + view(data.Sₖ, :, data.k) .= s + view(data.Yₖ, :, data.k) .= y + view(data.Dₖ.diag, data.k) .= dot(s, y) + mul!(view(data.Lₖ.data, data.k, 1:data.k-1), transpose(view(data.Yₖ, :, 1:data.k-1)), view(data.Sₖ, :, data.k) ) else # k == mem update circurlarly the intermediary structures - columnshift!(op.Sₖ; indicemax=op.k) - columnshift!(op.Yₖ; indicemax=op.k) - # op.Dₖ .= circshift(op.Dₖ, (-1, -1)) - vectorshift!(op.Dₖ.diag; indicemax=op.k) - view(op.Sₖ, :, op.k) .= s - view(op.Yₖ, :, op.k) .= y - view(op.Dₖ.diag, op.k) .= dot(s, y) - - map(i-> view(op.Lₖ, i:op.mem-1, i-1) .= view(op.Lₖ, i+1:op.mem, i), 2:op.mem) - mul!(view(op.Lₖ.data, op.k, 1:op.k-1), transpose(view(op.Yₖ, :, 1:op.k-1)), view(op.Sₖ, :, op.k) ) + columnshift!(data.Sₖ; indicemax=data.k) + columnshift!(data.Yₖ; indicemax=data.k) + # data.Dₖ .= circshift(data.Dₖ, (-1, -1)) + vectorshift!(data.Dₖ.diag; indicemax=data.k) + view(data.Sₖ, :, data.k) .= s + view(data.Yₖ, :, data.k) .= y + view(data.Dₖ.diag, data.k) .= dot(s, y) + + map(i-> view(data.Lₖ, i:data.mem-1, i-1) .= view(data.Lₖ, i+1:data.mem, i), 2:data.mem) + mul!(view(data.Lₖ.data, data.k, 1:data.k-1), transpose(view(data.Yₖ, :, 1:data.k-1)), view(data.Sₖ, :, data.k) ) end # step 4 and 6 - 
precompile_iterated_structure!(op) + precompile_iterated_structure!(data) # secant equation fails if uncommented - # op.α = dot(y,s)/dot(s,s) - return op + # data.α = dot(y,s)/dot(s,s) + return data end # Algorithm 3.2 (p15) # Theorem 2.3 (p6) -function Base.Matrix(op::CompressedLBFGSOperator{T,M,V}) where {T,M,V} - B₀ = M(undef, op.n, op.n) - map(i -> B₀[i, i] = op.α, 1:op.n) - - BSY = M(undef, op.n, 2*op.k) - (op.k > 0) && (BSY[:, 1:op.k] = B₀ * op.Sₖ[:, 1:op.k]) - (op.k > 0) && (BSY[:, op.k+1:2*op.k] = op.Yₖ[:, 1:op.k]) - _C = M(undef, 2*op.k, 2*op.k) - (op.k > 0) && (_C[1:op.k, 1:op.k] .= transpose(op.Sₖ[:, 1:op.k]) * op.Sₖ[:, 1:op.k]) - (op.k > 0) && (_C[1:op.k, op.k+1:2*op.k] .= op.Lₖ[1:op.k, 1:op.k]) - (op.k > 0) && (_C[op.k+1:2*op.k, 1:op.k] .= transpose(op.Lₖ[1:op.k, 1:op.k])) - (op.k > 0) && (_C[op.k+1:2*op.k, op.k+1:2*op.k] .-= op.Dₖ[1:op.k, 1:op.k]) +Base.Matrix(op::CompressedLBFGSOperator{T,M,V}) where {T,M,V} = Base.Matrix(op.data) +function Base.Matrix(data::CompressedLBFGSData{T,M,V}) where {T,M,V} + B₀ = M(undef, data.n, data.n) + map(i -> B₀[i, i] = data.α, 1:data.n) + + BSY = M(undef, data.n, 2*data.k) + (data.k > 0) && (BSY[:, 1:data.k] = B₀ * data.Sₖ[:, 1:data.k]) + (data.k > 0) && (BSY[:, data.k+1:2*data.k] = data.Yₖ[:, 1:data.k]) + _C = M(undef, 2*data.k, 2*data.k) + (data.k > 0) && (_C[1:data.k, 1:data.k] .= transpose(data.Sₖ[:, 1:data.k]) * data.Sₖ[:, 1:data.k]) + (data.k > 0) && (_C[1:data.k, data.k+1:2*data.k] .= data.Lₖ[1:data.k, 1:data.k]) + (data.k > 0) && (_C[data.k+1:2*data.k, 1:data.k] .= transpose(data.Lₖ[1:data.k, 1:data.k])) + (data.k > 0) && (_C[data.k+1:2*data.k, data.k+1:2*data.k] .-= data.Dₖ[1:data.k, 1:data.k]) C = inv(_C) Bₖ = B₀ .- BSY * C * transpose(BSY) @@ -156,59 +200,71 @@ end # Algorithm 3.2 (p15) # step 4, Jₖ is computed only if needed -function inverse_cholesky(op::CompressedLBFGSOperator{T,M,V}) where {T,M,V} - view(op.intermediate_diagonal.diag, 1:op.k) .= inv.(view(op.Dₖ.diag, 1:op.k)) +function 
inverse_cholesky(data::CompressedLBFGSData{T,M,V}) where {T,M,V} + view(data.intermediate_diagonal.diag, 1:data.k) .= inv.(view(data.Dₖ.diag, 1:data.k)) - mul!(view(op.inverse_intermediate_1, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k))) - mul!(view(op.chol_matrix, 1:op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.inverse_intermediate_1, 1:op.k, 1:op.k)) + mul!(view(data.inverse_intermediate_1, 1:data.k, 1:data.k), view(data.intermediate_diagonal, 1:data.k, 1:data.k), transpose(view(data.Lₖ, 1:data.k, 1:data.k))) + mul!(view(data.chol_matrix, 1:data.k, 1:data.k), view(data.Lₖ, 1:data.k, 1:data.k), view(data.inverse_intermediate_1, 1:data.k, 1:data.k)) - mul!(view(op.chol_matrix, 1:op.k, 1:op.k), transpose(view(op.Sₖ, :, 1:op.k)), view(op.Sₖ, :, 1:op.k), op.α, (T)(1)) + mul!(view(data.chol_matrix, 1:data.k, 1:data.k), transpose(view(data.Sₖ, :, 1:data.k)), view(data.Sₖ, :, 1:data.k), data.α, (T)(1)) - cholesky!(Symmetric(view(op.chol_matrix, 1:op.k, 1:op.k))) - Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k))) + cholesky!(Symmetric(view(data.chol_matrix, 1:data.k, 1:data.k))) + Jₖ = transpose(UpperTriangular(view(data.chol_matrix, 1:data.k, 1:data.k))) return Jₖ end # step 6, must be improve -function precompile_iterated_structure!(op::CompressedLBFGSOperator) - Jₖ = inverse_cholesky(op) +function precompile_iterated_structure!(data::CompressedLBFGSData) + Jₖ = inverse_cholesky(data) # constant update - view(op.intermediate_1, op.k+1:2*op.k, 1:op.k) .= 0 - view(op.intermediate_2, 1:op.k, op.k+1:2*op.k) .= 0 - view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ) - view(op.intermediate_2, op.k+1:2*op.k, op.k+1:2*op.k) .= Jₖ + view(data.intermediate_1, data.k+1:2*data.k, 1:data.k) .= 0 + view(data.intermediate_2, 1:data.k, data.k+1:2*data.k) .= 0 + view(data.intermediate_1, data.k+1:2*data.k, data.k+1:2*data.k) .= transpose(Jₖ) + view(data.intermediate_2, 
data.k+1:2*data.k, data.k+1:2*data.k) .= Jₖ # updates related to D^(1/2) - view(op.intermediate_diagonal.diag, 1:op.k) .= sqrt.(view(op.Dₖ.diag, 1:op.k)) - view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.intermediate_diagonal, 1:op.k, 1:op.k) - view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.intermediate_diagonal, 1:op.k, 1:op.k) + view(data.intermediate_diagonal.diag, 1:data.k) .= sqrt.(view(data.Dₖ.diag, 1:data.k)) + view(data.intermediate_1, 1:data.k,1:data.k) .= .- view(data.intermediate_diagonal, 1:data.k, 1:data.k) + view(data.intermediate_2, 1:data.k, 1:data.k) .= view(data.intermediate_diagonal, 1:data.k, 1:data.k) # updates related to D^(-1/2) - view(op.intermediate_diagonal.diag, 1:op.k) .= (x -> 1/sqrt(x)).(view(op.Dₖ.diag, 1:op.k)) - mul!(view(op.intermediate_1, 1:op.k,op.k+1:2*op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k))) - mul!(view(op.intermediate_2, op.k+1:2*op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k)) - view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .* -1 + view(data.intermediate_diagonal.diag, 1:data.k) .= (x -> 1/sqrt(x)).(view(data.Dₖ.diag, 1:data.k)) + mul!(view(data.intermediate_1, 1:data.k,data.k+1:2*data.k), view(data.intermediate_diagonal, 1:data.k, 1:data.k), transpose(view(data.Lₖ, 1:data.k, 1:data.k))) + mul!(view(data.intermediate_2, data.k+1:2*data.k, 1:data.k), view(data.Lₖ, 1:data.k, 1:data.k), view(data.intermediate_diagonal, 1:data.k, 1:data.k)) + view(data.intermediate_2, data.k+1:2*data.k, 1:data.k) .= view(data.intermediate_2, data.k+1:2*data.k, 1:data.k) .* -1 - view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[1:2*op.k, 1:2*op.k]) - view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_2[1:2*op.k, 1:2*op.k]) + view(data.inverse_intermediate_1, 1:2*data.k, 1:2*data.k) .= inv(data.intermediate_1[1:2*data.k, 1:2*data.k]) + 
view(data.inverse_intermediate_2, 1:2*data.k, 1:2*data.k) .= inv(data.intermediate_2[1:2*data.k, 1:2*data.k]) end # Algorithm 3.2 (p15) -function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGSOperator{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} +LinearAlgebra.mul!(Bv::V, op::CompressedLBFGSOperator{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} = LinearAlgebra.mul!(Bv, op.data, v) +function LinearAlgebra.mul!(Bv::V, data::CompressedLBFGSData{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}} + data.nprod += 1 # step 1-4 and 6 mainly done by Base.push! # step 5 - mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v) - mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :, 1:op.k)), v) - # scal!(op.α, view(op.sol, op.k+1:2*op.k)) # more allocation, slower - view(op.sol, op.k+1:2*op.k) .*= op.α + mul!(view(data.sol, 1:data.k), transpose(view(data.Yₖ, :, 1:data.k)), v) + mul!(view(data.sol, data.k+1:2*data.k), transpose(view(data.Sₖ, :, 1:data.k)), v) + # scal!(data.α, view(data.sol, data.k+1:2*data.k)) # more allocation, slower + view(data.sol, data.k+1:2*data.k) .*= data.α - mul!(view(op.intermediary_vector, 1:2*op.k), view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k), view(op.sol, 1:2*op.k)) - mul!(view(op.sol, 1:2*op.k), view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k), view(op.intermediary_vector, 1:2*op.k)) + mul!(view(data.intermediary_vector, 1:2*data.k), view(data.inverse_intermediate_2, 1:2*data.k, 1:2*data.k), view(data.sol, 1:2*data.k)) + mul!(view(data.sol, 1:2*data.k), view(data.inverse_intermediate_1, 1:2*data.k, 1:2*data.k), view(data.intermediary_vector, 1:2*data.k)) # step 7 - mul!(Bv, view(op.Yₖ, :, 1:op.k), view(op.sol, 1:op.k)) - mul!(Bv, view(op.Sₖ, :, 1:op.k), view(op.sol, op.k+1:2*op.k), - op.α, (T)(-1)) - Bv .+= op.α .* v + mul!(Bv, view(data.Yₖ, :, 1:data.k), view(data.sol, 1:data.k)) + mul!(Bv, view(data.Sₖ, :, 1:data.k), view(data.sol, data.k+1:2*data.k), - data.α, (T)(-1)) + Bv .+= data.α .* v return Bv +end + +""" + 
reset!(op) + +Resets the CompressedLBFGS data of the given operator. +""" +function reset!(op::CompressedLBFGSOperator) + op.data.nprod = 0 + return op end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 9538c177..ccc3895d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,16 +1,16 @@ using Arpack, Test, TestSetExtensions, LinearOperators using LinearAlgebra, SparseArrays -include("test_aux.jl") +# include("test_aux.jl") -include("test_linop.jl") -include("test_linop_allocs.jl") -include("test_adjtrans.jl") -include("test_cat.jl") -include("test_lbfgs.jl") +# include("test_linop.jl") +# include("test_linop_allocs.jl") +# include("test_adjtrans.jl") +# include("test_cat.jl") +# include("test_lbfgs.jl") include("test_clbfgs.jl") -include("test_lsr1.jl") -include("test_kron.jl") -include("test_callable.jl") -include("test_deprecated.jl") -include("test_normest.jl") -include("test_diag.jl") +# include("test_lsr1.jl") +# include("test_kron.jl") +# include("test_callable.jl") +# include("test_deprecated.jl") +# include("test_normest.jl") +# include("test_diag.jl")