Skip to content

Commit b4c2970

Browse files
author
RAYNAUD Paul (raynaudp)
committed
Reduce allocations in push! + modify tests
1 parent cab923d commit b4c2970

File tree

2 files changed

+56
-35
lines changed

2 files changed

+56
-35
lines changed

src/compressed_lbfgs.jl

Lines changed: 50 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ In addition to this structures which are circurlarly update when `k` reaches `m`
3636
- `inverse_intermediate_2`: a R²ᵏˣ²ᵏ matrix;
3737
- `intermediary_vector`: a vector ∈ Rᵏ to store intermediate solutions;
3838
- `sol`: a vector ∈ Rᵏ to store intermediate solutions;
39-
- `intermediate_structure_updated`: inform if the intermediate structures are up-to-date or not.
4039
This implementation is designed to work either on CPU or GPU.
4140
"""
4241
mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}}
@@ -50,54 +49,58 @@ mutable struct CompressedLBFGS{T, M<:AbstractMatrix{T}, V<:AbstractVector{T}}
5049
Lₖ::LowerTriangular{T,M} # m * m
5150

5251
chol_matrix::M # 2m * 2m
52+
intermediate_diagonal::Diagonal{T,V} # m * m
5353
intermediate_1::UpperTriangular{T,M} # 2m * 2m
5454
intermediate_2::LowerTriangular{T,M} # 2m * 2m
5555
inverse_intermediate_1::UpperTriangular{T,M} # 2m * 2m
5656
inverse_intermediate_2::LowerTriangular{T,M} # 2m * 2m
5757
intermediary_vector::V # 2m
5858
sol::V # m
59-
intermediate_structure_updated::Bool
6059
end
6160

6261
default_gpu() = CUDA.functional() ? true : false
63-
default_matrix_type(gpu::Bool, T::DataType) = gpu ? CuMatrix{T} : Matrix{T}
64-
default_vector_type(gpu::Bool, T::DataType) = gpu ? CuVector{T} : Vector{T}
62+
default_matrix_type(gpu::Bool; T::DataType=Float64) = gpu ? CuMatrix{T} : Matrix{T}
63+
default_vector_type(gpu::Bool; T::DataType=Float64) = gpu ? CuVector{T} : Vector{T}
64+
65+
function columnshift!(A::AbstractMatrix{T}; direction::Int=-1, indicemax::Int=size(A)[1]) where T
66+
map(i-> view(A,:,i+direction) .= view(A,:,i), 1-direction:indicemax)
67+
return A
68+
end
6569

6670
"""
6771
CompressedLBFGS(n::Int; [T=Float64, m=5], gpu:Bool)
6872
6973
A implementation of a LBFGS operator (forward), representing a `nxn` linear application.
7074
It considers at most `k` BFGS iterates, and fit the architecture depending if it is launched on a CPU or a GPU.
7175
"""
72-
function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu, T), V=default_vector_type(gpu, T))
76+
function CompressedLBFGS(n::Int; m::Int=5, T=Float64, gpu=default_gpu(), M=default_matrix_type(gpu; T), V=default_vector_type(gpu; T))
7377
α = (T)(1)
7478
k = 0
7579
Sₖ = M(undef, n, m)
7680
Yₖ = M(undef, n, m)
7781
Dₖ = Diagonal(V(undef, m))
7882
Lₖ = LowerTriangular(M(undef, m, m))
83+
Lₖ .= (T)(0)
7984

8085
chol_matrix = M(undef, m, m)
86+
intermediate_diagonal = Diagonal(V(undef, m))
8187
intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m))
8288
intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m))
8389
inverse_intermediate_1 = UpperTriangular(M(undef, 2*m, 2*m))
8490
inverse_intermediate_2 = LowerTriangular(M(undef, 2*m, 2*m))
8591
intermediary_vector = V(undef, 2*m)
8692
sol = V(undef, 2*m)
87-
intermediate_structure_updated = false
88-
return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol, intermediate_structure_updated)
93+
return CompressedLBFGS{T,M,V}(m, n, k, α, Sₖ, Yₖ, Dₖ, Lₖ, chol_matrix, intermediate_diagonal, intermediate_1, intermediate_2, inverse_intermediate_1, inverse_intermediate_2, intermediary_vector, sol)
8994
end
9095

9196
function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:AbstractVector{T}}
9297
if op.k < op.m # still some place in the structures
9398
op.k += 1
94-
op.Sₖ[:, op.k] .= s
95-
op.Yₖ[:, op.k] .= y
96-
op.Dₖ.diag[op.k] = dot(s, y)
97-
op.Lₖ.data[op.k, op.k] = 0
98-
for i in 1:op.k-1
99-
op.Lₖ.data[op.k, i] = dot(op.Sₖ[:, op.k], op.Yₖ[:, i])
100-
end
99+
view(op.Sₖ, :, op.k) .= s
100+
view(op.Yₖ, :, op.k) .= y
101+
view(op.Dₖ.diag, op.k) .= dot(s, y)
102+
mul!(view(op.Lₖ.data, op.k, 1:op.k-1), transpose(view(op.Yₖ, :, 1:op.k-1)), view(op.Sₖ, :, op.k) )
103+
101104
else # k == m update circurlarly the intermediary structures
102105
op.Sₖ .= circshift(op.Sₖ, (0, -1))
103106
op.Yₖ .= circshift(op.Yₖ, (0, -1))
@@ -109,13 +112,16 @@ function Base.push!(op::CompressedLBFGS{T,M,V}, s::V, y::V) where {T,M,V<:Abstra
109112
# for the time being, reinstantiate completely the Lₖ matrix
110113
for j in 1:op.k
111114
for i in 1:j-1
112-
op.Lₖ.data[j, i] = dot(op.Sₖ[:, j], op.Yₖ[:, i])
115+
op.Lₖ.data[j, i] = dot(view(op.Sₖ,:, j), view(op.Yₖ, :, i))
113116
end
114117
end
115118
end
119+
120+
# step 4 and 6
121+
precompile_iterated_structure!(op)
122+
116123
# secant equation fails if uncommented
117124
# op.α = dot(y,s)/dot(s,s)
118-
op.intermediate_structure_updated = false
119125
return op
120126
end
121127

@@ -141,8 +147,18 @@ end
141147

142148
# Algorithm 3.2 (p15)
143149
# step 4, Jₖ is computed only if needed
144-
function inverse_cholesky(op::CompressedLBFGS)
145-
view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
150+
function inverse_cholesky(op::CompressedLBFGS{T,M,V}) where {T,M,V}
151+
view(op.intermediate_diagonal.diag, 1:op.k) .= inv.(view(op.Dₖ.diag, 1:op.k))
152+
153+
# view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
154+
mul!(view(op.inverse_intermediate_1, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k)))
155+
mul!(view(op.chol_matrix, 1:op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.inverse_intermediate_1, 1:op.k, 1:op.k))
156+
157+
# view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k))
158+
mul!(view(op.chol_matrix, 1:op.k, 1:op.k), transpose(view(op.Sₖ, :, 1:op.k)), view(op.Sₖ, :, 1:op.k), op.α, (T)(1))
159+
160+
# view(op.chol_matrix, 1:op.k, 1:op.k) .= op.α .* (transpose(view(op.Sₖ, :, 1:op.k)) * view(op.Sₖ, :, 1:op.k)) .+ view(op.Lₖ, 1:op.k, 1:op.k) * inv(Diagonal(op.Dₖ[1:op.k, 1:op.k])) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
161+
146162
cholesky!(Symmetric(view(op.chol_matrix, 1:op.k, 1:op.k)))
147163
Jₖ = transpose(UpperTriangular(view(op.chol_matrix, 1:op.k, 1:op.k)))
148164
return Jₖ
@@ -152,29 +168,32 @@ end
152168
function precompile_iterated_structure!(op::CompressedLBFGS)
153169
Jₖ = inverse_cholesky(op)
154170

155-
view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.Dₖ, 1:op.k, 1:op.k)^(1/2)
156-
view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
171+
# constant update
157172
view(op.intermediate_1, op.k+1:2*op.k, 1:op.k) .= 0
158-
view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ)
159-
160-
view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(1/2)
161173
view(op.intermediate_2, 1:op.k, op.k+1:2*op.k) .= 0
162-
view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2)
174+
view(op.intermediate_1, op.k+1:2*op.k, op.k+1:2*op.k) .= transpose(Jₖ)
163175
view(op.intermediate_2, op.k+1:2*op.k, op.k+1:2*op.k) .= Jₖ
164176

177+
# updates related to D^(1/2)
178+
view(op.intermediate_diagonal.diag, 1:op.k) .= sqrt.(view(op.Dₖ.diag, 1:op.k))
179+
view(op.intermediate_1, 1:op.k,1:op.k) .= .- view(op.intermediate_diagonal, 1:op.k, 1:op.k)
180+
view(op.intermediate_2, 1:op.k, 1:op.k) .= view(op.intermediate_diagonal, 1:op.k, 1:op.k)
181+
182+
# updates related to D^(-1/2)
183+
view(op.intermediate_diagonal.diag, 1:op.k) .= (x -> 1/sqrt(x)).(view(op.Dₖ.diag, 1:op.k))
184+
mul!(view(op.intermediate_1, 1:op.k,op.k+1:2*op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k), transpose(view(op.Lₖ, 1:op.k, 1:op.k)))
185+
# view(op.intermediate_1, 1:op.k,op.k+1:2*op.k) .= view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2) * transpose(view(op.Lₖ, 1:op.k, 1:op.k))
186+
mul!(view(op.intermediate_2, op.k+1:2*op.k, 1:op.k), view(op.Lₖ, 1:op.k, 1:op.k), view(op.intermediate_diagonal, 1:op.k, 1:op.k))
187+
view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .* -1
188+
# view(op.intermediate_2, op.k+1:2*op.k, 1:op.k) .= .- view(op.Lₖ, 1:op.k, 1:op.k) * view(op.Dₖ, 1:op.k, 1:op.k)^(-1/2)
189+
165190
view(op.inverse_intermediate_1, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_1[1:2*op.k, 1:2*op.k])
166191
view(op.inverse_intermediate_2, 1:2*op.k, 1:2*op.k) .= inv(op.intermediate_2[1:2*op.k, 1:2*op.k])
167-
168-
op.intermediate_structure_updated = true
169192
end
170193

171194
# Algorithm 3.2 (p15)
172195
function LinearAlgebra.mul!(Bv::V, op::CompressedLBFGS{T,M,V}, v::V) where {T,M,V<:AbstractVector{T}}
173-
# step 1-3 mainly done by Base.push!
174-
175-
# steps 4 and 6, in case the intermediary structures required are not up to date
176-
(!op.intermediate_structure_updated) && (precompile_iterated_structure!(op))
177-
196+
# step 1-4 and 6 mainly done by Base.push!
178197
# step 5
179198
mul!(view(op.sol, 1:op.k), transpose(view(op.Yₖ, :, 1:op.k)), v)
180199
mul!(view(op.sol, op.k+1:2*op.k), transpose(view(op.Sₖ, :, 1:op.k)), v)

test/test_clbfgs.jl

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33
n=100
44
n=5
55
lbfgs = CompressedLBFGS(n) # m=5
6-
Bv = rand(n)
6+
V = LinearOperators.default_vector_type(LinearOperators.default_gpu())
7+
Bv = V(rand(n))
8+
s = V(rand(n))
9+
mul!(Bv, lbfgs, s) # warm-up
710
for i in 1:iter
8-
s = rand(n)
9-
y = rand(n)
11+
s = V(rand(n))
12+
y = V(rand(n))
1013
push!(lbfgs, s, y)
1114
# warmp-up computing the mandatory intermediate structures
12-
mul!(Bv, lbfgs, s)
1315
allocs = @allocated mul!(Bv, lbfgs, s)
1416
@test allocs == 0
1517
@test Bv y

0 commit comments

Comments
 (0)