diff --git a/.github/workflows/Tests.yml b/.github/workflows/Tests.yml index f648f8b103..54806c2f2d 100644 --- a/.github/workflows/Tests.yml +++ b/.github/workflows/Tests.yml @@ -24,8 +24,6 @@ jobs: test: # Run some of the slower test files individually. The last one catches everything # not included in the others. - - name: "essential/ad" - args: "essential/ad.jl" - name: "mcmc/gibbs" args: "mcmc/gibbs.jl" - name: "mcmc/hmc" @@ -37,7 +35,7 @@ jobs: - name: "mcmc/ess" args: "mcmc/ess.jl" - name: "everything else" - args: "--skip essential/ad.jl mcmc/gibbs.jl mcmc/hmc.jl mcmc/abstractmcmc.jl mcmc/Inference.jl mcmc/ess.jl" + args: "--skip mcmc/gibbs.jl mcmc/hmc.jl mcmc/abstractmcmc.jl mcmc/Inference.jl mcmc/ess.jl" runner: # Default - version: '1' diff --git a/Manifest.toml b/Manifest.toml index 2ce2ec80dd..540a7da842 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.11.3" manifest_format = "2.0" -project_hash = "83ec9face19bc568fc30cc287161517dc49f6c5c" +project_hash = "afdf28a30966aaa4af542a30879dd92074661565" [[deps.ADTypes]] git-tree-sha1 = "fb97701c117c8162e84dfcf80215caa904aef44f" diff --git a/Project.toml b/Project.toml index 9cfe795d91..d72f323373 100644 --- a/Project.toml +++ b/Project.toml @@ -23,7 +23,6 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Libtask = "6f1fad26-d15e-5dc8-ae53-837a1d7b8c9f" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c" -LogDensityProblemsAD = "996a588d-648d-4e1f-a8f0-a84b347e47b1" MCMCChains = "c7f686f2-ff18-58e9-bc7b-31028e88f75d" NamedArrays = "86f7a689-2022-50b4-a561-43c23ac3c673" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" @@ -69,7 +68,6 @@ ForwardDiff = "0.10.3" Libtask = "0.8.8" LinearAlgebra = "1" LogDensityProblems = "2" -LogDensityProblemsAD = "1.7.0" MCMCChains = "5, 6" NamedArrays = "0.9, 0.10" Optim = "1" diff --git a/ext/TuringDynamicHMCExt.jl b/ext/TuringDynamicHMCExt.jl index 1b47ab5c01..5718e3855a 100644 --- a/ext/TuringDynamicHMCExt.jl +++ b/ext/TuringDynamicHMCExt.jl @@ -3,17 +3,10 @@ module TuringDynamicHMCExt ### DynamicHMC backend - https://github.com/tpapp/DynamicHMC.jl ### -if isdefined(Base, :get_extension) - using DynamicHMC: DynamicHMC - using Turing - using Turing: AbstractMCMC, Random, LogDensityProblems, DynamicPPL - using Turing.Inference: ADTypes, LogDensityProblemsAD, TYPEDFIELDS -else - import ..DynamicHMC - using ..Turing - using ..Turing: AbstractMCMC, Random, LogDensityProblems, DynamicPPL - using ..Turing.Inference: ADTypes, LogDensityProblemsAD, TYPEDFIELDS -end +using DynamicHMC: DynamicHMC +using Turing +using Turing: AbstractMCMC, Random, LogDensityProblems, DynamicPPL +using Turing.Inference: ADTypes, TYPEDFIELDS """ DynamicNUTS @@ -69,10 +62,11 @@ function DynamicPPL.initialstep( end # Define log-density function. - ℓ = LogDensityProblemsAD.ADgradient( - Turing.LogDensityFunction( - model, vi, DynamicPPL.SamplingContext(spl, DynamicPPL.DefaultContext()) - ), + ℓ = DynamicPPL.LogDensityFunction( + model, + vi, + DynamicPPL.SamplingContext(spl, DynamicPPL.DefaultContext()); + adtype=spl.alg.adtype, ) # Perform initial step. diff --git a/ext/TuringOptimExt.jl b/ext/TuringOptimExt.jl index 3d31af6f2a..d6c253e2a2 100644 --- a/ext/TuringOptimExt.jl +++ b/ext/TuringOptimExt.jl @@ -1,14 +1,8 @@ module TuringOptimExt -if isdefined(Base, :get_extension) - using Turing: Turing - import Turing: DynamicPPL, NamedArrays, Accessors, Optimisation - using Optim: Optim -else - import ..Turing - import ..Turing: DynamicPPL, NamedArrays, Accessors, Optimisation - import ..Optim -end +using Turing: Turing +import Turing: DynamicPPL, NamedArrays, Accessors, Optimisation +using Optim: Optim #################### # Optim.jl methods # @@ -42,7 +36,7 @@ function Optim.optimize( ) ctx = Optimisation.OptimizationContext(DynamicPPL.LikelihoodContext()) f = Optimisation.OptimLogDensity(model, ctx) - init_vals = DynamicPPL.getparams(f) + init_vals = DynamicPPL.getparams(f.ldf) optimizer = Optim.LBFGS() return _mle_optimize(model, init_vals, optimizer, options; kwargs...) end @@ -65,7 +59,7 @@ function Optim.optimize( ) ctx = Optimisation.OptimizationContext(DynamicPPL.LikelihoodContext()) f = Optimisation.OptimLogDensity(model, ctx) - init_vals = DynamicPPL.getparams(f) + init_vals = DynamicPPL.getparams(f.ldf) return _mle_optimize(model, init_vals, optimizer, options; kwargs...) end function Optim.optimize( @@ -81,7 +75,7 @@ end function _mle_optimize(model::DynamicPPL.Model, args...; kwargs...) ctx = Optimisation.OptimizationContext(DynamicPPL.LikelihoodContext()) - return _optimize(model, Optimisation.OptimLogDensity(model, ctx), args...; kwargs...) + return _optimize(Optimisation.OptimLogDensity(model, ctx), args...; kwargs...) end """ @@ -112,7 +106,7 @@ function Optim.optimize( ) ctx = Optimisation.OptimizationContext(DynamicPPL.DefaultContext()) f = Optimisation.OptimLogDensity(model, ctx) - init_vals = DynamicPPL.getparams(f) + init_vals = DynamicPPL.getparams(f.ldf) optimizer = Optim.LBFGS() return _map_optimize(model, init_vals, optimizer, options; kwargs...) end @@ -135,7 +129,7 @@ function Optim.optimize( ) ctx = Optimisation.OptimizationContext(DynamicPPL.DefaultContext()) f = Optimisation.OptimLogDensity(model, ctx) - init_vals = DynamicPPL.getparams(f) + init_vals = DynamicPPL.getparams(f.ldf) return _map_optimize(model, init_vals, optimizer, options; kwargs...) end function Optim.optimize( @@ -151,18 +145,16 @@ end function _map_optimize(model::DynamicPPL.Model, args...; kwargs...) ctx = Optimisation.OptimizationContext(DynamicPPL.DefaultContext()) - return _optimize(model, Optimisation.OptimLogDensity(model, ctx), args...; kwargs...) + return _optimize(Optimisation.OptimLogDensity(model, ctx), args...; kwargs...) end - """ - _optimize(model::Model, f::OptimLogDensity, optimizer=Optim.LBFGS(), args...; kwargs...) + _optimize(f::OptimLogDensity, optimizer=Optim.LBFGS(), args...; kwargs...) Estimate a mode, i.e., compute a MLE or MAP estimate. """ function _optimize( - model::DynamicPPL.Model, f::Optimisation.OptimLogDensity, - init_vals::AbstractArray=DynamicPPL.getparams(f), + init_vals::AbstractArray=DynamicPPL.getparams(f.ldf), optimizer::Optim.AbstractOptimizer=Optim.LBFGS(), options::Optim.Options=Optim.Options(), args...; @@ -170,9 +162,12 @@ function _optimize( ) # Convert the initial values, since it is assumed that users provide them # in the constrained space. - f = Accessors.@set f.varinfo = DynamicPPL.unflatten(f.varinfo, init_vals) - f = Accessors.@set f.varinfo = DynamicPPL.link(f.varinfo, model) - init_vals = DynamicPPL.getparams(f) + # TODO(penelopeysm): As with in src/optimisation/Optimisation.jl, unclear + # whether initialisation is really necessary at all + vi = DynamicPPL.unflatten(f.ldf.varinfo, init_vals) + vi = DynamicPPL.link(vi, f.ldf.model) + f = Optimisation.OptimLogDensity(f.ldf.model, vi, f.ldf.context; adtype=f.ldf.adtype) + init_vals = DynamicPPL.getparams(f.ldf) # Optimize! M = Optim.optimize(Optim.only_fg!(f), init_vals, optimizer, options, args...; kwargs...) @@ -186,12 +181,16 @@ function _optimize( end # Get the optimum in unconstrained space. `getparams` does the invlinking. - f = Accessors.@set f.varinfo = DynamicPPL.unflatten(f.varinfo, M.minimizer) - vns_vals_iter = Turing.Inference.getparams(model, f.varinfo) + vi = f.ldf.varinfo + vi_optimum = DynamicPPL.unflatten(vi, M.minimizer) + logdensity_optimum = Optimisation.OptimLogDensity( + f.ldf.model, vi_optimum, f.ldf.context + ) + vns_vals_iter = Turing.Inference.getparams(f.ldf.model, vi_optimum) varnames = map(Symbol ∘ first, vns_vals_iter) vals = map(last, vns_vals_iter) vmat = NamedArrays.NamedArray(vals, varnames) - return Optimisation.ModeResult(vmat, M, -M.minimum, f) + return Optimisation.ModeResult(vmat, M, -M.minimum, logdensity_optimum) end end # module diff --git a/src/mcmc/Inference.jl b/src/mcmc/Inference.jl index 3c430c4bde..31308f7768 100644 --- a/src/mcmc/Inference.jl +++ b/src/mcmc/Inference.jl @@ -48,7 +48,6 @@ import AdvancedPS import Accessors import EllipticalSliceSampling import LogDensityProblems -import LogDensityProblemsAD import Random import MCMCChains import StatsBase: predict @@ -160,29 +159,6 @@ function externalsampler( return ExternalSampler(sampler, adtype, Val(unconstrained)) end -getADType(spl::Sampler) = getADType(spl.alg) -getADType(::SampleFromPrior) = Turing.DEFAULT_ADTYPE - -getADType(ctx::DynamicPPL.SamplingContext) = getADType(ctx.sampler) -getADType(ctx::DynamicPPL.AbstractContext) = getADType(DynamicPPL.NodeTrait(ctx), ctx) -getADType(::DynamicPPL.IsLeaf, ctx::DynamicPPL.AbstractContext) = Turing.DEFAULT_ADTYPE -function getADType(::DynamicPPL.IsParent, ctx::DynamicPPL.AbstractContext) - return getADType(DynamicPPL.childcontext(ctx)) -end - -getADType(alg::Hamiltonian) = alg.adtype - -function LogDensityProblemsAD.ADgradient(ℓ::DynamicPPL.LogDensityFunction) - return LogDensityProblemsAD.ADgradient(getADType(ℓ.context), ℓ) -end - -function LogDensityProblems.logdensity( - f::Turing.LogDensityFunction{<:AbstractVarInfo,<:Model,<:DynamicPPL.DefaultContext}, - x::NamedTuple, -) - return DynamicPPL.logjoint(f.model, DynamicPPL.unflatten(f.varinfo, x)) -end - # TODO: make a nicer `set_namedtuple!` and move these functions to DynamicPPL. function DynamicPPL.unflatten(vi::TypedVarInfo, θ::NamedTuple) set_namedtuple!(deepcopy(vi), θ) diff --git a/src/mcmc/abstractmcmc.jl b/src/mcmc/abstractmcmc.jl index c582ab9856..b436f43579 100644 --- a/src/mcmc/abstractmcmc.jl +++ b/src/mcmc/abstractmcmc.jl @@ -1,6 +1,6 @@ -struct TuringState{S,F} +struct TuringState{S,M,V,C} state::S - logdensity::F + ldf::DynamicPPL.LogDensityFunction{M,V,C} end state_to_turing(f::DynamicPPL.LogDensityFunction, state) = TuringState(state, f) @@ -12,20 +12,10 @@ function transition_to_turing(f::DynamicPPL.LogDensityFunction, transition) return Transition(f.model, varinfo, transition) end -state_to_turing(f::LogDensityProblemsAD.ADGradientWrapper, state) = TuringState(state, f) -function transition_to_turing(f::LogDensityProblemsAD.ADGradientWrapper, transition) - return transition_to_turing(parent(f), transition) -end - -function varinfo_from_logdensityfn(f::LogDensityProblemsAD.ADGradientWrapper) - return varinfo_from_logdensityfn(parent(f)) -end -varinfo_from_logdensityfn(f::DynamicPPL.LogDensityFunction) = f.varinfo - function varinfo(state::TuringState) - θ = getparams(DynamicPPL.getmodel(state.logdensity), state.state) + θ = getparams(state.ldf.model, state.state) # TODO: Do we need to link here first? - return DynamicPPL.unflatten(varinfo_from_logdensityfn(state.logdensity), θ) + return DynamicPPL.unflatten(state.ldf.varinfo, θ) end varinfo(state::AbstractVarInfo) = state @@ -40,23 +30,6 @@ getstats(transition::AdvancedHMC.Transition) = transition.stat getparams(::DynamicPPL.Model, transition::AdvancedMH.Transition) = transition.params -getvarinfo(f::DynamicPPL.LogDensityFunction) = f.varinfo -function getvarinfo(f::LogDensityProblemsAD.ADGradientWrapper) - return getvarinfo(LogDensityProblemsAD.parent(f)) -end - -function setvarinfo(f::DynamicPPL.LogDensityFunction, varinfo) - return DynamicPPL.LogDensityFunction(f.model, varinfo, f.context; adtype=f.adtype) -end - -function setvarinfo( - f::LogDensityProblemsAD.ADGradientWrapper, varinfo, adtype::ADTypes.AbstractADType -) - return LogDensityProblemsAD.ADgradient( - adtype, setvarinfo(LogDensityProblemsAD.parent(f), varinfo) - ) -end - # TODO: Do we also support `resume`, etc? function AbstractMCMC.step( rng::Random.AbstractRNG, @@ -69,12 +42,8 @@ function AbstractMCMC.step( alg = sampler_wrapper.alg sampler = alg.sampler - # Create a log-density function with an implementation of the - # gradient so we ensure that we're using the same AD backend as in Turing. - f = LogDensityProblemsAD.ADgradient(alg.adtype, DynamicPPL.LogDensityFunction(model)) - - # Link the varinfo if needed. - varinfo = getvarinfo(f) + # Initialise varinfo with initial params and link the varinfo if needed. + varinfo = DynamicPPL.VarInfo(model) if requires_unconstrained_space(alg) if initial_params !== nothing # If we have initial parameters, we need to set the varinfo before linking. @@ -85,9 +54,11 @@ function AbstractMCMC.step( varinfo = DynamicPPL.link(varinfo, model) end end - f = setvarinfo(f, varinfo, alg.adtype) - # Then just call `AdvancedHMC.step` with the right arguments. + # Construct LogDensityFunction + f = DynamicPPL.LogDensityFunction(model, varinfo; adtype=alg.adtype) + + # Then just call `AbstractMCMC.step` with the right arguments. if initial_state === nothing transition_inner, state_inner = AbstractMCMC.step( rng, AbstractMCMC.LogDensityModel(f), sampler; initial_params, kwargs... @@ -114,7 +85,7 @@ function AbstractMCMC.step( kwargs..., ) sampler = sampler_wrapper.alg.sampler - f = state.logdensity + f = state.ldf # Then just call `AdvancedHMC.step` with the right arguments. transition_inner, state_inner = AbstractMCMC.step( diff --git a/src/mcmc/gibbs.jl b/src/mcmc/gibbs.jl index df9e8ff24d..e447bd2fc8 100644 --- a/src/mcmc/gibbs.jl +++ b/src/mcmc/gibbs.jl @@ -438,7 +438,7 @@ function setparams_varinfo!!( state::TuringState, params::AbstractVarInfo, ) - logdensity = DynamicPPL.setmodel(state.logdensity, model, sampler.alg.adtype) + logdensity = DynamicPPL.setmodel(state.ldf, model, sampler.alg.adtype) new_inner_state = setparams_varinfo!!( AbstractMCMC.LogDensityModel(logdensity), sampler, state.state, params ) diff --git a/src/mcmc/hmc.jl b/src/mcmc/hmc.jl index 1550c169f0..098e62eb22 100644 --- a/src/mcmc/hmc.jl +++ b/src/mcmc/hmc.jl @@ -156,19 +156,19 @@ function DynamicPPL.initialstep( # Create a Hamiltonian. metricT = getmetricT(spl.alg) metric = metricT(length(theta)) - ℓ = LogDensityProblemsAD.ADgradient( - Turing.LogDensityFunction( - model, - vi, - # Use the leaf-context from the `model` in case the user has - # contextualized the model with something like `PriorContext` - # to sample from the prior. - DynamicPPL.SamplingContext(rng, spl, DynamicPPL.leafcontext(model.context)), - ), + ldf = DynamicPPL.LogDensityFunction( + model, + vi, + # TODO(penelopeysm): Can we just use leafcontext(model.context)? Do we + # need to pass in the sampler? (In fact LogDensityFunction defaults to + # using leafcontext(model.context) so could we just remove the argument + # entirely?) + DynamicPPL.SamplingContext(rng, spl, DynamicPPL.leafcontext(model.context)); + adtype=spl.alg.adtype, ) - logπ = Base.Fix1(LogDensityProblems.logdensity, ℓ) - ∂logπ∂θ(x) = LogDensityProblems.logdensity_and_gradient(ℓ, x) - hamiltonian = AHMC.Hamiltonian(metric, logπ, ∂logπ∂θ) + lp_func = Base.Fix1(LogDensityProblems.logdensity, ldf) + lp_grad_func = Base.Fix1(LogDensityProblems.logdensity_and_gradient, ldf) + hamiltonian = AHMC.Hamiltonian(metric, lp_func, lp_grad_func) # Compute phase point z. z = AHMC.phasepoint(rng, theta, hamiltonian) @@ -191,7 +191,7 @@ function DynamicPPL.initialstep( vi = last(DynamicPPL.evaluate!!(model, rng, vi, SampleFromUniform())) theta = vi[:] - hamiltonian = AHMC.Hamiltonian(metric, logπ, ∂logπ∂θ) + hamiltonian = AHMC.Hamiltonian(metric, lp_func, lp_grad_func) z = AHMC.phasepoint(rng, theta, hamiltonian) init_attempt_count += 1 @@ -287,16 +287,19 @@ end function get_hamiltonian(model, spl, vi, state, n) metric = gen_metric(n, spl, state) - ℓ = LogDensityProblemsAD.ADgradient( - Turing.LogDensityFunction( - model, - vi, - DynamicPPL.SamplingContext(spl, DynamicPPL.leafcontext(model.context)), - ), + ldf = DynamicPPL.LogDensityFunction( + model, + vi, + # TODO(penelopeysm): Can we just use leafcontext(model.context)? Do we + # need to pass in the sampler? (In fact LogDensityFunction defaults to + # using leafcontext(model.context) so could we just remove the argument + # entirely?) + DynamicPPL.SamplingContext(spl, DynamicPPL.leafcontext(model.context)); + adtype=spl.alg.adtype, ) - ℓπ = Base.Fix1(LogDensityProblems.logdensity, ℓ) - ∂ℓπ∂θ = Base.Fix1(LogDensityProblems.logdensity_and_gradient, ℓ) - return AHMC.Hamiltonian(metric, ℓπ, ∂ℓπ∂θ) + lp_func = Base.Fix1(LogDensityProblems.logdensity, ldf) + lp_grad_func = Base.Fix1(LogDensityProblems.logdensity_and_gradient, ldf) + return AHMC.Hamiltonian(metric, lp_func, lp_grad_func) end """ diff --git a/src/mcmc/sghmc.jl b/src/mcmc/sghmc.jl index 593acc7cb6..9a6124e14a 100644 --- a/src/mcmc/sghmc.jl +++ b/src/mcmc/sghmc.jl @@ -66,10 +66,11 @@ function DynamicPPL.initialstep( # Compute initial sample and state. sample = Transition(model, vi) - ℓ = LogDensityProblemsAD.ADgradient( - Turing.LogDensityFunction( - model, vi, DynamicPPL.SamplingContext(spl, DynamicPPL.DefaultContext()) - ), + ℓ = DynamicPPL.LogDensityFunction( + model, + vi, + DynamicPPL.SamplingContext(spl, DynamicPPL.DefaultContext()); + adtype=spl.alg.adtype, ) state = SGHMCState(ℓ, vi, zero(vi[spl])) @@ -228,10 +229,11 @@ function DynamicPPL.initialstep( # Create first sample and state. sample = SGLDTransition(model, vi, zero(spl.alg.stepsize(0))) - ℓ = LogDensityProblemsAD.ADgradient( - Turing.LogDensityFunction( - model, vi, DynamicPPL.SamplingContext(spl, DynamicPPL.DefaultContext()) - ), + ℓ = DynamicPPL.LogDensityFunction( + model, + vi, + DynamicPPL.SamplingContext(spl, DynamicPPL.DefaultContext()); + adtype=spl.alg.adtype, ) state = SGLDState(ℓ, vi, 1) diff --git a/src/optimisation/Optimisation.jl b/src/optimisation/Optimisation.jl index eac57b1f80..3d8ce1d4a3 100644 --- a/src/optimisation/Optimisation.jl +++ b/src/optimisation/Optimisation.jl @@ -4,7 +4,6 @@ using ..Turing using NamedArrays: NamedArrays using DynamicPPL: DynamicPPL using LogDensityProblems: LogDensityProblems -using LogDensityProblemsAD: LogDensityProblemsAD using Optimization: Optimization using OptimizationOptimJL: OptimizationOptimJL using Random: Random @@ -95,22 +94,71 @@ function DynamicPPL.tilde_observe( end """ - OptimLogDensity{M<:DynamicPPL.Model,C<:Context,V<:DynamicPPL.VarInfo} + OptimLogDensity{ + M<:DynamicPPL.Model, + V<:DynamicPPL.VarInfo, + C<:OptimizationContext, + AD<:ADTypes.AbstractADType + } + +A struct that wraps a single LogDensityFunction. Can be invoked either using + +```julia +OptimLogDensity(model, varinfo, ctx; adtype=adtype) +``` + +or + +```julia +OptimLogDensity(model, ctx; adtype=adtype) +``` + +If not specified, `adtype` defaults to `AutoForwardDiff()`. + +An OptimLogDensity does not, in itself, obey the LogDensityProblems interface. +Thus, if you want to calculate the log density of its contents at the point +`z`, you should manually call -A struct that stores the negative log density function of a `DynamicPPL` model. +```julia +LogDensityProblems.logdensity(f.ldf, z) +``` + +However, it is a callable object which returns the *negative* log density of +the underlying LogDensityFunction at the point `z`. This is done to satisfy +the Optim.jl interface. + +```julia +optim_ld = OptimLogDensity(model, varinfo, ctx) +optim_ld(z) # returns -logp +``` """ -const OptimLogDensity{M<:DynamicPPL.Model,C<:OptimizationContext,V<:DynamicPPL.VarInfo,AD} = Turing.LogDensityFunction{ - M,V,C,AD +struct OptimLogDensity{ + M<:DynamicPPL.Model, + V<:DynamicPPL.VarInfo, + C<:OptimizationContext, + AD<:ADTypes.AbstractADType, } + ldf::Turing.LogDensityFunction{M,V,C,AD} +end -""" - OptimLogDensity(model::DynamicPPL.Model, context::OptimizationContext) +function OptimLogDensity( + model::DynamicPPL.Model, + vi::DynamicPPL.VarInfo, + ctx::OptimizationContext; + adtype::ADTypes.AbstractADType=AutoForwardDiff(), +) + return OptimLogDensity(Turing.LogDensityFunction(model, vi, ctx; adtype=adtype)) +end -Create a callable `OptimLogDensity` struct that evaluates a model using the given `context`. -""" -function OptimLogDensity(model::DynamicPPL.Model, context::OptimizationContext) - init = DynamicPPL.VarInfo(model) - return Turing.LogDensityFunction(model, init, context) +# No varinfo +function OptimLogDensity( + model::DynamicPPL.Model, + ctx::OptimizationContext; + adtype::ADTypes.AbstractADType=AutoForwardDiff(), +) + return OptimLogDensity( + Turing.LogDensityFunction(model, DynamicPPL.VarInfo(model), ctx; adtype=adtype) + ) end """ @@ -123,40 +171,30 @@ depends on the context of `f`. Any second argument is ignored. The two-argument method only exists to match interface the required by Optimization.jl. """ -function (f::OptimLogDensity)(z::AbstractVector) - varinfo = DynamicPPL.unflatten(f.varinfo, z) - return -DynamicPPL.getlogp(last(DynamicPPL.evaluate!!(f.model, varinfo, f.context))) -end - +(f::OptimLogDensity)(z::AbstractVector) = -LogDensityProblems.logdensity(f.ldf, z) (f::OptimLogDensity)(z, _) = f(z) -# NOTE: This seems a bit weird IMO since this is the _negative_ log-likelihood. -LogDensityProblems.logdensity(f::OptimLogDensity, z::AbstractVector) = f(z) - # NOTE: The format of this function is dictated by Optim. The first argument sets whether to # compute the function value, the second whether to compute the gradient (and stores the # gradient). The last one is the actual argument of the objective function. function (f::OptimLogDensity)(F, G, z) if G !== nothing - # Calculate negative log joint and its gradient. - # TODO: Make OptimLogDensity already an LogDensityProblems.ADgradient? Allow to - # specify AD? - ℓ = LogDensityProblemsAD.ADgradient(f) - neglogp, ∇neglogp = LogDensityProblems.logdensity_and_gradient(ℓ, z) + # Calculate log joint and its gradient. + logp, ∇logp = LogDensityProblems.logdensity_and_gradient(f.ldf, z) - # Save the gradient to the pre-allocated array. - copyto!(G, ∇neglogp) + # Save the negative gradient to the pre-allocated array. + copyto!(G, -∇logp) # If F is something, the negative log joint is requested as well. # We have already computed it as a by-product above and hence return it directly. if F !== nothing - return neglogp + return -logp end end # Only negative log joint requested but no gradient. if F !== nothing - return LogDensityProblems.logdensity(f, z) + return -LogDensityProblems.logdensity(f.ldf, z) end return nothing @@ -232,9 +270,11 @@ function StatsBase.informationmatrix( # Convert the values to their unconstrained states to make sure the # Hessian is computed with respect to the untransformed parameters. - linked = DynamicPPL.istrans(m.f.varinfo) + linked = DynamicPPL.istrans(m.f.ldf.varinfo) if linked - m = Accessors.@set m.f.varinfo = DynamicPPL.invlink!!(m.f.varinfo, m.f.model) + new_vi = DynamicPPL.invlink!!(m.f.ldf.varinfo, m.f.ldf.model) + new_f = OptimLogDensity(m.f.ldf.model, new_vi, m.f.ldf.context) + m = Accessors.@set m.f = new_f end # Calculate the Hessian, which is the information matrix because the negative of the log @@ -244,7 +284,9 @@ function StatsBase.informationmatrix( # Link it back if we invlinked it. if linked - m = Accessors.@set m.f.varinfo = DynamicPPL.link!!(m.f.varinfo, m.f.model) + new_vi = DynamicPPL.link!!(m.f.ldf.varinfo, m.f.ldf.model) + new_f = OptimLogDensity(m.f.ldf.model, new_vi, m.f.ldf.context) + m = Accessors.@set m.f = new_f end return NamedArrays.NamedArray(info, (varnames, varnames)) @@ -265,7 +307,7 @@ Return the values of all the variables with the symbol(s) `var_symbol` in the mo argument should be either a `Symbol` or a vector of `Symbol`s. """ function Base.get(m::ModeResult, var_symbols::AbstractVector{Symbol}) - log_density = m.f + log_density = m.f.ldf # Get all the variable names in the model. This is the same as the list of keys in # m.values, but they are more convenient to filter when they are VarNames rather than # Symbols. @@ -297,9 +339,9 @@ richer format of `ModeResult`. It also takes care of transforming them back to t parameter space in case the optimization was done in a transformed space. """ function ModeResult(log_density::OptimLogDensity, solution::SciMLBase.OptimizationSolution) - varinfo_new = DynamicPPL.unflatten(log_density.varinfo, solution.u) + varinfo_new = DynamicPPL.unflatten(log_density.ldf.varinfo, solution.u) # `getparams` performs invlinking if needed - vns_vals_iter = Turing.Inference.getparams(log_density.model, varinfo_new) + vns_vals_iter = Turing.Inference.getparams(log_density.ldf.model, varinfo_new) syms = map(Symbol ∘ first, vns_vals_iter) vals = map(last, vns_vals_iter) return ModeResult( @@ -383,12 +425,15 @@ end OptimizationProblem(log_density::OptimLogDensity, adtype, constraints) Create an `OptimizationProblem` for the objective function defined by `log_density`. + +Note that the adtype parameter here overrides any adtype parameter the +OptimLogDensity was constructed with. """ function Optimization.OptimizationProblem(log_density::OptimLogDensity, adtype, constraints) # Note that OptimLogDensity is a callable that evaluates the model with given # parameters. Hence we can use it in the objective function as below. f = Optimization.OptimizationFunction(log_density, adtype; cons=constraints.cons) - initial_params = log_density.varinfo[:] + initial_params = log_density.ldf.varinfo[:] prob = if !has_constraints(constraints) Optimization.OptimizationProblem(f, initial_params) else @@ -454,28 +499,34 @@ function estimate_mode( end # Create an OptimLogDensity object that can be used to evaluate the objective function, - # i.e. the negative log density. Set its VarInfo to the initial parameters. - log_density = let - inner_context = if estimator isa MAP - DynamicPPL.DefaultContext() - else - DynamicPPL.LikelihoodContext() - end - ctx = OptimizationContext(inner_context) - ld = OptimLogDensity(model, ctx) - Accessors.@set ld.varinfo = DynamicPPL.unflatten(ld.varinfo, initial_params) + # i.e. the negative log density. + inner_context = if estimator isa MAP + DynamicPPL.DefaultContext() + else + DynamicPPL.LikelihoodContext() end + ctx = OptimizationContext(inner_context) + + # Set its VarInfo to the initial parameters. + # TODO(penelopeysm): Unclear if this is really needed? Any time that logp is calculated + # (using `LogDensityProblems.logdensity(ldf, x)`) the parameters in the + # varinfo are completely ignored. The parameters only matter if you are calling evaluate!! + # directly on the fields of the LogDensityFunction + vi = DynamicPPL.VarInfo(model) + vi = DynamicPPL.unflatten(vi, initial_params) + # Link the varinfo if needed. # TODO(mhauru) We currently couple together the questions of whether the user specified # bounds/constraints and whether we transform the objective function to an # unconstrained space. These should be separate concerns, but for that we need to # implement getting the bounds of the prior distributions. optimise_in_unconstrained_space = !has_constraints(constraints) if optimise_in_unconstrained_space - transformed_varinfo = DynamicPPL.link(log_density.varinfo, log_density.model) - log_density = Accessors.@set log_density.varinfo = transformed_varinfo + vi = DynamicPPL.link(vi, model) end + log_density = OptimLogDensity(model, vi, ctx) + prob = Optimization.OptimizationProblem(log_density, adtype, constraints) solution = Optimization.solve(prob, solver; kwargs...) # TODO(mhauru) We return a ModeResult for compatibility with the older Optim.jl diff --git a/test/essential/ad.jl b/test/essential/ad.jl deleted file mode 100644 index a5fa33e5d4..0000000000 --- a/test/essential/ad.jl +++ /dev/null @@ -1,250 +0,0 @@ -module AdTests - -using ..Models: gdemo_default -using Distributions: logpdf -using DynamicPPL: getlogp, getindex_internal -using ForwardDiff -using LinearAlgebra -using LogDensityProblems: LogDensityProblems -using LogDensityProblemsAD: LogDensityProblemsAD -using ReverseDiff -using Test: @test, @testset -using Turing -using Turing: SampleFromPrior -using Zygote - -function test_model_ad(model, f, syms::Vector{Symbol}) - # Set up VI. - vi = Turing.VarInfo(model) - - # Collect symbols. - vnms = Vector(undef, length(syms)) - vnvals = Vector{Float64}() - for i in 1:length(syms) - s = syms[i] - vnms[i] = getfield(vi.metadata, s).vns[1] - - vals = getindex_internal(vi, vnms[i]) - for i in eachindex(vals) - push!(vnvals, vals[i]) - end - end - - # Compute primal. - x = vec(vnvals) - logp = f(x) - - # Call ForwardDiff's AD directly. - grad_FWAD = sort(ForwardDiff.gradient(f, x)) - - # Compare with `logdensity_and_gradient`. - z = vi[:] - for chunksize in (0, 1, 10), standardtag in (true, false, 0, 3) - ℓ = LogDensityProblemsAD.ADgradient( - Turing.AutoForwardDiff(; chunksize=chunksize, tag=standardtag), - Turing.LogDensityFunction( - model, - vi, - DynamicPPL.SamplingContext(SampleFromPrior(), DynamicPPL.DefaultContext()), - ), - ) - l, ∇E = LogDensityProblems.logdensity_and_gradient(ℓ, z) - - # Compare result - @test l ≈ logp - @test sort(∇E) ≈ grad_FWAD atol = 1e-9 - end -end - -@testset "ad.jl" begin - @testset "adr" begin - ad_test_f = gdemo_default - vi = Turing.VarInfo(ad_test_f) - ad_test_f(vi, SampleFromPrior()) - svn = vi.metadata.s.vns[1] - mvn = vi.metadata.m.vns[1] - _s = getindex_internal(vi, svn)[1] - _m = getindex_internal(vi, mvn)[1] - - dist_s = InverseGamma(2, 3) - - # Hand-written logp - function logp(x::Vector) - s = x[2] - # s = invlink(dist_s, s) - m = x[1] - lik_dist = Normal(m, sqrt(s)) - lp = logpdf(dist_s, s) + logpdf(Normal(0, sqrt(s)), m) - lp += logpdf(lik_dist, 1.5) + logpdf(lik_dist, 2.0) - return lp - end - - # Call ForwardDiff's AD - g = x -> ForwardDiff.gradient(logp, x) - # _s = link(dist_s, _s) - _x = [_m, _s] - grad_FWAD = sort(g(_x)) - - ℓ = Turing.LogDensityFunction( - ad_test_f, - vi, - DynamicPPL.SamplingContext(SampleFromPrior(), DynamicPPL.DefaultContext()), - ) - x = map(x -> Float64(x), vi[:]) - - zygoteℓ = LogDensityProblemsAD.ADgradient(Turing.AutoZygote(), ℓ) - if isdefined(Base, :get_extension) - @test zygoteℓ isa - Base.get_extension( - LogDensityProblemsAD, :LogDensityProblemsADZygoteExt - ).ZygoteGradientLogDensity - else - @test zygoteℓ isa - LogDensityProblemsAD.LogDensityProblemsADZygoteExt.ZygoteGradientLogDensity - end - @test zygoteℓ.ℓ === ℓ - ∇E2 = LogDensityProblems.logdensity_and_gradient(zygoteℓ, x)[2] - @test sort(∇E2) ≈ grad_FWAD atol = 1e-9 - end - - @testset "general AD tests" begin - # Tests gdemo gradient. - function logp1(x::Vector) - dist_s = InverseGamma(2, 3) - s = x[2] - m = x[1] - lik_dist = Normal(m, sqrt(s)) - lp = - Turing.logpdf_with_trans(dist_s, s, false) + - Turing.logpdf_with_trans(Normal(0, sqrt(s)), m, false) - lp += logpdf(lik_dist, 1.5) + logpdf(lik_dist, 2.0) - return lp - end - - test_model_ad(gdemo_default, logp1, [:m, :s]) - - # Test Wishart AD. - @model function wishart_ad() - v ~ Wishart(7, [1 0.5; 0.5 1]) - return v - end - - # Hand-written logp - function logp3(x) - dist_v = Wishart(7, [1 0.5; 0.5 1]) - v = [x[1] x[3]; x[2] x[4]] - lp = Turing.logpdf_with_trans(dist_v, v, false) - return lp - end - - test_model_ad(wishart_ad(), logp3, [:v]) - end - @testset "Simplex Zygote and ReverseDiff (with and without caching) AD" begin - @model function dir() - return theta ~ Dirichlet(1 ./ fill(4, 4)) - end - sample(dir(), HMC(0.01, 1; adtype=AutoZygote()), 1000) - sample(dir(), HMC(0.01, 1; adtype=AutoReverseDiff(; compile=false)), 1000) - sample(dir(), HMC(0.01, 1; adtype=AutoReverseDiff(; compile=true)), 1000) - end - @testset "PDMatDistribution AD" begin - @model function wishart() - return theta ~ Wishart(4, Matrix{Float64}(I, 4, 4)) - end - - sample(wishart(), HMC(0.01, 1; adtype=AutoReverseDiff(; compile=false)), 1000) - sample(wishart(), HMC(0.01, 1; adtype=AutoZygote()), 1000) - - @model function invwishart() - return theta ~ InverseWishart(4, Matrix{Float64}(I, 4, 4)) - end - - sample(invwishart(), HMC(0.01, 1; adtype=AutoReverseDiff(; compile=false)), 1000) - sample(invwishart(), HMC(0.01, 1; adtype=AutoZygote()), 1000) - end - @testset "Hessian test" begin - @model function tst(x, ::Type{TV}=Vector{Float64}) where {TV} - params = TV(undef, 2) - @. params ~ Normal(0, 1) - - return x ~ MvNormal(params, I) - end - - function make_logjoint(model::DynamicPPL.Model, ctx::DynamicPPL.AbstractContext) - # setup - varinfo_init = Turing.VarInfo(model) - spl = DynamicPPL.SampleFromPrior() - varinfo_init = DynamicPPL.link!!(varinfo_init, model) - - function logπ(z; unlinked=false) - varinfo = DynamicPPL.unflatten(varinfo_init, z) - - # TODO(torfjelde): Pretty sure this is a mistake. - # Why are we not linking `varinfo` rather than `varinfo_init`? - if unlinked - varinfo_init = DynamicPPL.invlink!!(varinfo_init, model) - end - varinfo = last( - DynamicPPL.evaluate!!( - model, varinfo, DynamicPPL.SamplingContext(spl, ctx) - ), - ) - if unlinked - varinfo_init = DynamicPPL.link!!(varinfo_init, model) - end - - return -DynamicPPL.getlogp(varinfo) - end - - return logπ - end - - data = [0.5, -0.5] - model = tst(data) - - likelihood = make_logjoint(model, DynamicPPL.LikelihoodContext()) - target(x) = likelihood(x; unlinked=true) - - H_f = ForwardDiff.hessian(target, zeros(2)) - H_r = ReverseDiff.hessian(target, zeros(2)) - @test H_f == [1.0 0.0; 0.0 1.0] - @test H_f == H_r - end - - @testset "memoization: issue #1393" begin - @model function demo(data) - sigma ~ Uniform(0.0, 20.0) - return data ~ Normal(0, sigma) - end - - N = 1000 - for i in 1:5 - d = Normal(0.0, i) - data = rand(d, N) - chn = sample( - demo(data), NUTS(0.65; adtype=AutoReverseDiff(; compile=true)), 1000 - ) - @test mean(Array(chn[:sigma])) ≈ std(data) atol = 0.5 - end - end - - @testset "ReverseDiff compiled without linking" begin - f = DynamicPPL.LogDensityFunction(gdemo_default) - θ = DynamicPPL.getparams(f) - - f_rd = LogDensityProblemsAD.ADgradient(Turing.AutoReverseDiff(; compile=false), f) - f_rd_compiled = LogDensityProblemsAD.ADgradient( - Turing.AutoReverseDiff(; compile=true), f - ) - - ℓ, ℓ_grad = LogDensityProblems.logdensity_and_gradient(f_rd, θ) - ℓ_compiled, ℓ_grad_compiled = LogDensityProblems.logdensity_and_gradient( - f_rd_compiled, θ - ) - - @test ℓ == ℓ_compiled - @test ℓ_grad == ℓ_grad_compiled - end -end - -end diff --git a/test/ext/OptimInterface.jl b/test/ext/OptimInterface.jl index 56d53a14dc..163cf36c53 100644 --- a/test/ext/OptimInterface.jl +++ b/test/ext/OptimInterface.jl @@ -143,7 +143,6 @@ using Turing DynamicPPL.TestUtils.demo_assume_multivariate_observe_literal, DynamicPPL.TestUtils.demo_dot_assume_observe_submodel, DynamicPPL.TestUtils.demo_dot_assume_observe_matrix_index, - DynamicPPL.TestUtils.demo_dot_assume_matrix_dot_observe_matrix, DynamicPPL.TestUtils.demo_assume_submodel_observe_index_literal, DynamicPPL.TestUtils.demo_dot_assume_observe_index, DynamicPPL.TestUtils.demo_dot_assume_observe_index_literal, diff --git a/test/mcmc/abstractmcmc.jl b/test/mcmc/abstractmcmc.jl index 165e97eeef..82baa41197 100644 --- a/test/mcmc/abstractmcmc.jl +++ b/test/mcmc/abstractmcmc.jl @@ -8,7 +8,6 @@ using DynamicPPL: DynamicPPL using ForwardDiff: ForwardDiff using LinearAlgebra: I using LogDensityProblems: LogDensityProblems -using LogDensityProblemsAD: LogDensityProblemsAD using Random: Random using ReverseDiff: ReverseDiff using StableRNGs: StableRNG @@ -18,16 +17,12 @@ using Turing using Turing.Inference: AdvancedHMC function initialize_nuts(model::Turing.Model) - # Create a log-density function with an implementation of the - # gradient so we ensure that we're using the same AD backend as in Turing. - f = LogDensityProblemsAD.ADgradient(DynamicPPL.LogDensityFunction(model)) - - # Link the varinfo. - f = Turing.Inference.setvarinfo( - f, - DynamicPPL.link!!(Turing.Inference.getvarinfo(f), model), - Turing.Inference.getADType(LogDensityProblemsAD.parent(f).context), - ) + # Create a linked varinfo + vi = DynamicPPL.VarInfo(model) + linked_vi = DynamicPPL.link!!(vi, model) + + # Create a LogDensityFunction + f = DynamicPPL.LogDensityFunction(model, linked_vi; adtype=Turing.DEFAULT_ADTYPE) # Choose parameter dimensionality and initial parameter value D = LogDensityProblems.dimension(f) @@ -148,27 +143,6 @@ end end end end - - @testset "don't drop `ADgradient` (PR: #2223)" begin - rng = Random.default_rng() - model = DynamicPPL.TestUtils.DEMO_MODELS[1] - sampler = initialize_nuts(model) - sampler_ext = externalsampler( - sampler; unconstrained=true, adtype=AutoForwardDiff() - ) - # Initial step. - state = last( - AbstractMCMC.step(rng, model, DynamicPPL.Sampler(sampler_ext); n_adapts=0) - ) - @test state.logdensity isa LogDensityProblemsAD.ADGradientWrapper - # Subsequent step. - state = last( - AbstractMCMC.step( - rng, model, DynamicPPL.Sampler(sampler_ext), state; n_adapts=0 - ), - ) - @test state.logdensity isa LogDensityProblemsAD.ADGradientWrapper - end end @testset "AdvancedMH.jl" begin diff --git a/test/optimisation/Optimisation.jl b/test/optimisation/Optimisation.jl index 518e6f6bd3..5ec7afae40 100644 --- a/test/optimisation/Optimisation.jl +++ b/test/optimisation/Optimisation.jl @@ -545,7 +545,6 @@ using Turing DynamicPPL.TestUtils.demo_assume_multivariate_observe_literal, DynamicPPL.TestUtils.demo_dot_assume_observe_submodel, DynamicPPL.TestUtils.demo_dot_assume_observe_matrix_index, - DynamicPPL.TestUtils.demo_dot_assume_matrix_dot_observe_matrix, DynamicPPL.TestUtils.demo_assume_submodel_observe_index_literal, DynamicPPL.TestUtils.demo_dot_assume_observe_index, DynamicPPL.TestUtils.demo_dot_assume_observe_index_literal, @@ -624,16 +623,9 @@ using Turing m = DynamicPPL.contextualize( gdemo_default, ADUtils.ADTypeCheckContext(adbackend, gdemo_default.context) ) - if adbackend isa AutoForwardDiff - # TODO: Figure out why this is happening. - # https://github.com/TuringLang/Turing.jl/issues/2369 - @test_throws DivideError maximum_likelihood(m; adtype=adbackend) - @test_throws DivideError maximum_a_posteriori(m; adtype=adbackend) - else - # These will error if the adbackend being used is not the one set. - maximum_likelihood(m; adtype=adbackend) - maximum_a_posteriori(m; adtype=adbackend) - end + # These will error if the adbackend being used is not the one set. + maximum_likelihood(m; adtype=adbackend) + maximum_a_posteriori(m; adtype=adbackend) end end diff --git a/test/runtests.jl b/test/runtests.jl index 093615e540..0a948a317e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -39,7 +39,6 @@ end end @testset "essential" verbose = true begin - @timeit_include("essential/ad.jl") @timeit_include("essential/container.jl") end diff --git a/test/test_utils/ad_utils.jl b/test/test_utils/ad_utils.jl index b6a92aa96f..ee544e7ce3 100644 --- a/test/test_utils/ad_utils.jl +++ b/test/test_utils/ad_utils.jl @@ -202,7 +202,7 @@ end All the ADTypes on which we want to run the tests. """ adbackends = [ - Turing.AutoForwardDiff(; chunksize=0), + Turing.AutoForwardDiff(), Turing.AutoReverseDiff(; compile=false), Turing.AutoMooncake(; config=nothing), ]