Support ADTypes and autodiff by default (#178)
* Remove ForwardDiff as dependency

* Add ADTypes as dependency

* Add adtype keyword to singlepath

* Refactor utility functions to use AD if available

* Support OptimizationFunction constructed with adtype

* Support adtype in multipathfinder

* Support adtype for Turing models

* Remove error checks for methods that now work

* Increment patch number

* Don't set chunksize to 0

Optimization.jl doesn't like this

* Don't assume parameters supported

Seems to be optional, and Turing models don't support it

* Remove type unknown to extension

* Remove no-longer-needed check

* Remove build_optim_problem

* Unify LogDensityProblems methods

* Only include test_utils.jl once

* Unify example LogDensityProblems

* Update tests

* Remove adtype constraint to allow `NoAD`

* Bump compats to Optimization and Turing versions with ADTypes

* Bump deps lower bounds to match those of Turing/Optimization

* Bump ForwardDiff lower bound for tests

* Bump Optim compat to support Compat v4

* Bump lower bounds for DynamicPPL/Compat compat

* Bump compats in other environments to match

* Bump lower bounds for docs for DynamicPPL compatibility

* Skip downgrading Folds and Transducers for docs/integrations

Increasing these lower bounds ultimately produces a conflict, since they require BangBang v0.4 and no Turing version is yet compatible with that version.

* Fix test

* Update since ∇f removed from callback

* Bump SciMLBase lower bound to support ADTypes v0.2

* Fail on non-finite gradient only for newer Optimization.jl versions

* Bump OptimizationOptimJL lower bound

* Bump Turing lower bound

* Bump TransformVariables lower bound

* Add AbstractADType type constraint

* Update docstring

* Note ADTypes support

* Load ADTypes

* Remove unneeded call to `dim`

* Add ADTypes and ReverseDiff as docs deps

* Update quickstart to use and describe ADTypes

* Bump ReverseDiff compat lower bound

* Increment version number

* Test adtype is correctly used if function provided

* Bump Pathfinder compats for docs and integration tests

* Avoid deploying docs for downgrade builds
sethaxen authored Jul 2, 2024
1 parent fae6c4b commit c334ef7
Showing 23 changed files with 315 additions and 273 deletions.
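The user-facing change, in brief: `pathfinder` and `multipathfinder` gain an `adtype` keyword that accepts any `ADTypes.AbstractADType`, defaulting to forward mode. A minimal sketch of the new call surface, matching the quickstart changes below (the target `logp` here is illustrative, not taken from the commit):

```julia
using ADTypes, Pathfinder, ReverseDiff

# Illustrative target: a standard-normal log-density, up to an additive constant.
logp(x) = -sum(abs2, x) / 2

# Default backend: gradients via ForwardDiff (ADTypes.AutoForwardDiff()).
result_fd = pathfinder(logp; dim=5, init_scale=4)

# Explicit backend: pass any ADTypes.AbstractADType, e.g. reverse mode.
result_rd = pathfinder(logp; dim=5, init_scale=4, adtype=AutoReverseDiff())
```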
4 changes: 3 additions & 1 deletion .github/workflows/CI.yml
@@ -76,7 +76,7 @@ jobs:
         version: ${{ matrix.version }}
     - uses: julia-actions/[email protected]
       with:
-        skip: LinearAlgebra,Pathfinder,Random,Statistics
+        skip: LinearAlgebra,Pathfinder,Random,Statistics,Folds,Transducers
         projects: ., docs
       if: matrix.downgrade
       name: Downgrade dependencies to oldest supported versions
@@ -86,3 +86,5 @@ jobs:
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       DOCUMENTER_KEY: ${{ matrix.version == '1' && secrets.DOCUMENTER_KEY || '' }}
       GKSwstype: "100" # https://discourse.julialang.org/t/generation-of-documentation-fails-qt-qpa-xcb-could-not-connect-to-display/60988
+      # don't deploy if it's a downgrade build
+      DEPLOY_DOCS: ${{ !matrix.downgrade }}
2 changes: 1 addition & 1 deletion .github/workflows/IntegrationTests.yml
@@ -27,7 +27,7 @@ jobs:
         arch: x64
     - uses: julia-actions/[email protected]
       with:
-        skip: LinearAlgebra,Pathfinder,Random,Statistics
+        skip: LinearAlgebra,Pathfinder,Random,Statistics,Folds,Transducers
         projects: ., test/integration/${{ matrix.package }}
       if: ${{ matrix.downgrade }}
       name: Downgrade dependencies to oldest supported versions
30 changes: 17 additions & 13 deletions Project.toml
@@ -1,12 +1,12 @@
 name = "Pathfinder"
 uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 authors = ["Seth Axen <[email protected]> and contributors"]
-version = "0.8.8"
+version = "0.9.0-DEV"

 [deps]
+ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Folds = "41a02a25-b8f0-4f67-bc48-60067656b558"
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c"
@@ -36,40 +36,44 @@ PathfinderDynamicHMCExt = "DynamicHMC"
 PathfinderTuringExt = ["Accessors", "DynamicPPL", "MCMCChains", "Turing"]

 [compat]
-Accessors = "0.1.1"
-Distributions = "0.25.57"
+Accessors = "0.1.12"
+ADTypes = "0.2"
+Distributions = "0.25.87"
 DynamicHMC = "3.4.0"
-DynamicPPL = "0.21.5, 0.22, 0.23, 0.24, 0.25, 0.27"
+DynamicPPL = "0.24.7, 0.25, 0.27"
 Folds = "0.2.2"
-ForwardDiff = "0.10.13"
+ForwardDiff = "0.10.19"
 IrrationalConstants = "0.1.1, 0.2"
 LinearAlgebra = "1.6"
 LogDensityProblems = "2.1.0"
-MCMCChains = "5.3.0, 6"
-Optim = "1.6.2"
-Optimization = "3.5"
-OptimizationOptimJL = "0.1.1, 0.2, 0.3"
+MCMCChains = "6.0.2"
+Optim = "1.7.2"
+Optimization = "3.16.0"
+OptimizationOptimJL = "0.1.7, 0.2, 0.3"
 PDMats = "0.11.26"
 PSIS = "0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9"
 ProgressLogging = "0.1.4"
 Random = "1.6"
 Requires = "1.1"
-SciMLBase = "1.61.2, 2"
+ReverseDiff = "1.4.5"
+SciMLBase = "1.95.0, 2"
 Statistics = "1.6"
 StatsBase = "0.33.7, 0.34"
 Transducers = "0.4.66"
-Turing = "0.24.2, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32"
+Turing = "0.30.5, 0.31, 0.32"
 UnPack = "1"
 julia = "1.6"

 [extras]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
 DynamicHMC = "bbc10e6e-7c05-544b-b16e-64fede858acb"
 DynamicPPL = "366bfd00-2699-11ea-058f-f148b4cae6d8"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 MCMCChains = "c7f686f2-ff18-58e9-bc7b-31028e88f75d"
 OptimizationNLopt = "4e6fcdb7-1186-4e1f-a706-475e75c168bb"
+ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"

 [targets]
-test = ["OptimizationNLopt", "Test"]
+test = ["ForwardDiff", "OptimizationNLopt", "ReverseDiff", "Test"]
16 changes: 10 additions & 6 deletions docs/Project.toml
@@ -1,4 +1,5 @@
 [deps]
+ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 AdvancedHMC = "0bf59076-c3b1-5ca4-86bd-e02cd72cde3d"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 DynamicHMC = "bbc10e6e-7c05-544b-b16e-64fede858acb"
@@ -8,22 +9,25 @@ LogDensityProblems = "6fdf6af0-433a-55f7-b3ed-c6c6e0b8df7c"
 LogDensityProblemsAD = "996a588d-648d-4e1f-a8f0-a84b347e47b1"
 Pathfinder = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd"
 TransformVariables = "84d833dd-6860-57f9-a1a7-6da5db126cff"
 TransformedLogDensities = "f9bc47f6-f3f8-4f3b-ab21-f8bc73906f26"
 Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"

 [compat]
-AdvancedHMC = "0.4, 0.5, 0.6"
+ADTypes = "0.2"
+AdvancedHMC = "0.6"
 Documenter = "1"
 DynamicHMC = "3.4.0"
-ForwardDiff = "0.10.13"
+ForwardDiff = "0.10.19"
 LogDensityProblems = "2.1.0"
-LogDensityProblemsAD = "1.4"
-Pathfinder = "0.8"
+LogDensityProblemsAD = "1.7"
+Pathfinder = "0.9"
+ReverseDiff = "1.4.5"
 StatsFuns = "1"
 StatsPlots = "0.14.21, 0.15"
-TransformVariables = "0.6, 0.7, 0.8"
+TransformVariables = "0.6.2, 0.7, 0.8"
 TransformedLogDensities = "1.0.2"
-Turing = "0.24.2, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30, 0.31, 0.32"
+Turing = "0.30.5, 0.31, 0.32"
6 changes: 5 additions & 1 deletion docs/make.jl
@@ -24,4 +24,8 @@ makedocs(;
     ],
 )

-deploydocs(; repo="github.com/mlcolab/Pathfinder.jl", devbranch="main", push_preview=true)
+if get(ENV, "DEPLOY_DOCS", "true") == "true"
+    deploydocs(;
+        repo="github.com/mlcolab/Pathfinder.jl", devbranch="main", push_preview=true
+    )
+end
59 changes: 29 additions & 30 deletions docs/src/examples/quickstart.md
@@ -6,26 +6,21 @@ This page introduces basic Pathfinder usage with examples.

 For a simple example, we'll run Pathfinder on a multivariate normal distribution with
 a dense covariance matrix.
-Pathfinder expects an object that implements the [LogDensityProblems](https://www.tamaspapp.eu/LogDensityProblems.jl) interface and has a gradient implemented.
-We can use automatic differentiation to compute the gradient using [LogDensityProblemsAD](https://github.com/tpapp/LogDensityProblemsAD.jl).
+Pathfinder can take a log-density function.
+By default, the gradient of the log-density function is computed using ForwardDiff.

 ```@example 1
-using ForwardDiff, LinearAlgebra, LogDensityProblems, LogDensityProblemsAD,
-    Pathfinder, Printf, StatsPlots, Random
+using ADTypes, ForwardDiff, LinearAlgebra, LogDensityProblems,
+    Pathfinder, Printf, ReverseDiff, StatsPlots, Random

 Random.seed!(42)

 struct MvNormalProblem{T,S}
     μ::T  # mean
     P::S  # precision matrix
 end
-function LogDensityProblems.capabilities(::Type{<:MvNormalProblem})
-    return LogDensityProblems.LogDensityOrder{0}()
-end
-LogDensityProblems.dimension(ℓ::MvNormalProblem) = length(ℓ.μ)
-function LogDensityProblems.logdensity(ℓ::MvNormalProblem, x)
-    z = x - μ
-    return -dot(z, P, z) / 2
+function (prob::MvNormalProblem)(x)
+    z = x - prob.μ
+    return -dot(z, prob.P, z) / 2
 end

 Σ = [
@@ -37,15 +32,15 @@ end
 ]
 μ = [-0.55, 0.49, -0.76, 0.25, 0.94]
 P = inv(Symmetric(Σ))
-prob_mvnormal = ADgradient(:ForwardDiff, MvNormalProblem(μ, P))
+prob_mvnormal = MvNormalProblem(μ, P)

 nothing # hide
 ```

 Now we run [`pathfinder`](@ref).

 ```@example 1
-result = pathfinder(prob_mvnormal; init_scale=4)
+result = pathfinder(prob_mvnormal; dim=5, init_scale=4)
 ```

 `result` is a [`PathfinderResult`](@ref).
@@ -126,21 +121,30 @@ Now we will run Pathfinder on the following banana-shaped distribution with dens
 \pi(x_1, x_2) = e^{-x_1^2 / 2} e^{-5 (x_2 - x_1^2)^2 / 2}.
 ```

+Pathfinder can also take any object that implements the [LogDensityProblems](https://www.tamaspapp.eu/LogDensityProblems.jl) interface.
+This can also be used to manually define the gradient of the log-density function.
+
 First we define the log density problem:

 ```@example 1
 Random.seed!(23)

 struct BananaProblem end
 function LogDensityProblems.capabilities(::Type{<:BananaProblem})
-    return LogDensityProblems.LogDensityOrder{0}()
+    return LogDensityProblems.LogDensityOrder{1}()
 end
-LogDensityProblems.dimension(::BananaProblem) = 2
-function LogDensityProblems.logdensity(::BananaProblem, x)
+LogDensityProblems.dimension(::BananaProblem) = 2
+function LogDensityProblems.logdensity(::BananaProblem, x)
     return -(x[1]^2 + 5(x[2] - x[1]^2)^2) / 2
 end
+function LogDensityProblems.logdensity_and_gradient(::BananaProblem, x)
+    a = (x[2] - x[1]^2)
+    lp = -(x[1]^2 + 5a^2) / 2
+    grad_lp = [(10a - 1) * x[1], -5a]
+    return lp, grad_lp
+end

-prob_banana = ADgradient(:ForwardDiff, BananaProblem())
+prob_banana = BananaProblem()

 nothing # hide
 ```
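Since the commit hand-codes `logdensity_and_gradient` for the banana problem, a quick cross-check against ForwardDiff can catch sign or factor slips. The check below is an illustrative addition, not part of the diff:

```julia
using ForwardDiff, LogDensityProblems

x = [0.3, -1.2]  # arbitrary test point (illustrative)
lp, grad_lp = LogDensityProblems.logdensity_and_gradient(BananaProblem(), x)
# Differentiate the scalar log-density with ForwardDiff and compare.
grad_fd = ForwardDiff.gradient(z -> LogDensityProblems.logdensity(BananaProblem(), z), x)
@assert grad_lp ≈ grad_fd
```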
@@ -218,6 +222,8 @@ Multi-path Pathfinder can't sample the funnel well, but it can quickly give us d
 In this example, we draw from a 100-dimensional funnel and visualize 2 dimensions.

 ```@example 1
+using ReverseDiff, ADTypes
+
 Random.seed!(68)

 function logp_funnel(x)
@@ -227,23 +233,16 @@ function logp_funnel(x)
     return ((τ / 3)^2 + (n - 1) * τ + sum(b -> abs2(b * exp(-τ / 2)), β)) / -2
 end

-struct FunnelProblem
-    dim::Int
-end
-function LogDensityProblems.capabilities(::Type{<:FunnelProblem})
-    return LogDensityProblems.LogDensityOrder{0}()
-end
-LogDensityProblems.dimension(ℓ::FunnelProblem) = ℓ.dim
-LogDensityProblems.logdensity(::FunnelProblem, x) = logp_funnel(x)
-prob_funnel = ADgradient(:ForwardDiff, FunnelProblem(100))
-
 nothing # hide
 ```

 First, let's fit this posterior with single-path Pathfinder.
+For high-dimensional problems, it's better to use reverse-mode automatic differentiation.
+Here, we'll use `ADTypes.AutoReverseDiff()` to specify that [ReverseDiff.jl](https://github.com/JuliaDiff/ReverseDiff.jl) should be used.
+
 ```@example 1
-result_single = pathfinder(prob_funnel; init_scale=10)
+result_single = pathfinder(logp_funnel; dim=100, init_scale=10, adtype=AutoReverseDiff())
 ```

 Let's visualize this sequence of multivariate normals for the first two dimensions.
@@ -268,7 +267,7 @@ Now we run [`multipathfinder`](@ref).

 ```@example 1
 ndraws = 1_000
-result = multipathfinder(prob_funnel, ndraws; nruns=20, init_scale=10)
+result = multipathfinder(logp_funnel, ndraws; dim=100, nruns=20, init_scale=10, adtype=AutoReverseDiff())
 ```

 Again, the poor Pareto shape diagnostic indicates we should run MCMC to get draws suitable for computing posterior estimates.
1 change: 1 addition & 0 deletions docs/src/index.md
@@ -40,6 +40,7 @@ See [Initializing HMC with Pathfinder](@ref) for details.
 Pathfinder uses several packages for extended functionality:

 - [Optimization.jl](https://optimization.sciml.ai/stable/): This allows the L-BFGS optimizer to be replaced with any of the many Optimization-compatible optimizers and supports use of callbacks. Note that any changes made to Pathfinder using these features would be experimental.
+- [ADTypes.jl](https://sciml.github.io/ADTypes.jl/stable/): Supports specifying the automatic differentiation engine to be used for computing gradient and Hessian, if needed.
 - [Transducers.jl](https://juliafolds.github.io/Transducers.jl/stable/): parallelization support
 - [Distributions.jl](https://juliastats.org/Distributions.jl/stable/)/[PDMats.jl](https://github.com/JuliaStats/PDMats.jl): fits can be used anywhere a `Distribution` can be used
 - [LogDensityProblems.jl](https://www.tamaspapp.eu/LogDensityProblems.jl/stable/): defining the log-density function, gradient, and Hessian
10 changes: 8 additions & 2 deletions ext/PathfinderTuringExt.jl
@@ -2,6 +2,7 @@ module PathfinderTuringExt

 if isdefined(Base, :get_extension)
     using Accessors: Accessors
+    using ADTypes: ADTypes
     using DynamicPPL: DynamicPPL
     using MCMCChains: MCMCChains
     using Pathfinder: Pathfinder
@@ -10,6 +11,7 @@ if isdefined(Base, :get_extension)
     import Pathfinder: flattened_varnames_list
 else # using Requires
     using ..Accessors: Accessors
+    using ..ADTypes: ADTypes
     using ..DynamicPPL: DynamicPPL
     using ..MCMCChains: MCMCChains
     using ..Pathfinder: Pathfinder
@@ -107,10 +109,13 @@ function Pathfinder.pathfinder(
     init_scale=2,
     init_sampler=Pathfinder.UniformSampler(init_scale),
     init=nothing,
+    adtype::ADTypes.AbstractADType=Pathfinder.default_ad(),
     kwargs...,
 )
     var_names = flattened_varnames_list(model)
-    prob = Turing.optim_problem(model, Turing.MAP(); constrained=false, init_theta=init)
+    prob = Turing.optim_problem(
+        model, Turing.MAP(); constrained=false, init_theta=init, adtype
+    )
     init_sampler(rng, prob.prob.u0)
     result = Pathfinder.pathfinder(prob.prob; rng, input=model, kwargs...)
     draws = reduce(vcat, transpose.(prob.transform.(eachcol(result.draws))))
@@ -126,10 +131,11 @@ function Pathfinder.multipathfinder(
     init_scale=2,
     init_sampler=Pathfinder.UniformSampler(init_scale),
     nruns::Int,
+    adtype=Pathfinder.default_ad(),
     kwargs...,
 )
     var_names = flattened_varnames_list(model)
-    fun = Turing.optim_function(model, Turing.MAP(); constrained=false)
+    fun = Turing.optim_function(model, Turing.MAP(); constrained=false, adtype)
     init1 = fun.init()
     init = [init_sampler(rng, init1)]
     for _ in 2:nruns
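With these changes, `adtype` flows through to Turing's `optim_function`/`optim_problem`, so a Turing model can select its AD backend the same way as a plain log-density function. A hedged sketch (the model itself is illustrative, not from the commit):

```julia
using ADTypes, LinearAlgebra, Pathfinder, ReverseDiff, Turing

# A toy regression model, for illustration only.
@model function linreg(x, y)
    σ ~ truncated(Normal(); lower=0)
    β ~ Normal()
    y ~ MvNormal(β .* x, σ^2 * I)
end

model = linreg(randn(10), randn(10))

# adtype defaults to Pathfinder.default_ad(); here we opt into reverse mode.
result = pathfinder(model; adtype=AutoReverseDiff())
```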
6 changes: 4 additions & 2 deletions src/Pathfinder.jl
@@ -1,9 +1,8 @@
 module Pathfinder

+using ADTypes: ADTypes
 using Distributions: Distributions
 using Folds: Folds
-# ensure that ForwardDiff is conditionally loaded by Optimization
-using ForwardDiff: ForwardDiff
 using IrrationalConstants: log2π
 using LinearAlgebra
 using LogDensityProblems: LogDensityProblems
@@ -37,6 +36,9 @@ function default_optimizer(history_length)
     )
 end

+# We depend on Optim, and Optim depends on ForwardDiff, so we can offer it as a default.
+default_ad() = ADTypes.AutoForwardDiff()
+
 include("transducers.jl")
 include("woodbury.jl")
 include("optimize.jl")
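The new `default_ad` turns the previously hard-wired ForwardDiff dependency into an overridable default: calling `pathfinder` without `adtype` should behave the same as passing `AutoForwardDiff()` explicitly. A sketch under that assumption (`logp` is illustrative; the `rng` and `dim` keywords appear in the diffs above):

```julia
using ADTypes, Pathfinder, Random

logp(x) = -sum(abs2, x) / 2  # illustrative target

# Same seed for both runs so they start from identical initial draws.
r_default = pathfinder(logp; rng=Random.MersenneTwister(42), dim=2)
r_explicit = pathfinder(logp; rng=Random.MersenneTwister(42), dim=2, adtype=AutoForwardDiff())
```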