Setup Accelerate and MKL for 32-bit, MKL getrf, and Metal.jl integration #361

Merged: 10 commits, Aug 10, 2023
7 changes: 5 additions & 2 deletions Project.toml
@@ -30,17 +30,19 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"

[extensions]
LinearSolveCUDAExt = "CUDA"
LinearSolveHYPREExt = "HYPRE"
LinearSolveIterativeSolversExt = "IterativeSolvers"
LinearSolveMKLExt = "MKL_jll"
LinearSolveKrylovKitExt = "KrylovKit"
LinearSolveMKLExt = "MKL_jll"
LinearSolveMetalExt = "Metal"
LinearSolvePardisoExt = "Pardiso"

[compat]
@@ -74,6 +76,7 @@ IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
MultiFloats = "bdf0d083-296b-4888-a5b6-7498122e68a5"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
51 changes: 51 additions & 0 deletions benchmarks/applelu.jl
@@ -0,0 +1,51 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

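# Approximate flop count for LU factorization of an m×n matrix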
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 4:8:500
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metallubench.png")
savefig("metallubench.pdf")
51 changes: 51 additions & 0 deletions benchmarks/cudalu.jl
@@ -0,0 +1,51 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, CUDA, MKL_jll
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

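# Approximate flop count for LU factorization of an m×n matrix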
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [MKLLUFactorization(), CudaOffloadFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 200:400:20000
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("cudaoffloadlubench.png")
savefig("cudaoffloadlubench.pdf")
51 changes: 51 additions & 0 deletions benchmarks/metallu.jl
@@ -0,0 +1,51 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

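# Approximate flop count for LU factorization of an m×n matrix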
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 200:400:20000
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metal_large_lubench.png")
savefig("metal_large_lubench.pdf")
65 changes: 60 additions & 5 deletions docs/src/solvers/solvers.md
@@ -7,15 +7,37 @@ Solves for ``Au=b`` in the problem defined by `prob` using the algorithm

## Recommended Methods

### Dense Matrices

The default algorithm (`nothing`) picks an algorithm that will work, but one may need to
change it to get more performance or precision. If more precision is necessary,
`QRFactorization()` and `SVDFactorization()` are the best choices, with SVD being
the slowest but most precise.

For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations.
`FastLUFactorization` will be faster than `LUFactorization` which is the Base.LinearAlgebra
(`\` default) implementation of LU factorization. `SimpleLUFactorization` will be fast
on very small matrices.
For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations up to around
150×150 matrices, though this can depend on the exact details of the hardware. Beyond this
point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers,
`AppleAccelerateLUFactorization` is generally always the fastest. `LUFactorization` will
use your base system BLAS, which can be fast or slow depending on the hardware configuration.
`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.
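For example, a minimal sketch of opting into one of these methods (the matrix size here is
illustrative; `MKL_jll` must be installed for the MKL extension to load, per the `[weakdeps]`
entry above):

```julia
using LinearSolve, MKL_jll

A = rand(300, 300)
b = rand(300)
prob = LinearProblem(A, b)

sol = solve(prob)                        # default algorithm choice
sol = solve(prob, MKLLUFactorization())  # explicit MKL LU, often faster at this size
sol.u                                    # the solution vector
```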

For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
on Mac hardware to offload, with a cutoff point of around 20,000×20,000 matrices beyond which it
is faster (and it only supports Float32). `CudaOffloadFactorization` can be more efficient at a
much smaller cutoff, possibly around 1,000×1,000 matrices, though this is highly dependent
on the chosen GPU hardware, and it requires a CUDA-compatible NVIDIA GPU.
CUDA offload supports Float64, but most consumer GPU hardware is much faster on Float32
(many are >32x faster for Float32 operations than Float64 operations), and thus for most hardware
this is only recommended for Float32 matrices.
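As an illustrative sketch of GPU offload (hedged: the cutoffs above vary by hardware, and
Metal.jl requires an M-series Mac and Float32):

```julia
using LinearSolve, Metal  # loading Metal enables MetalLUFactorization

A = rand(Float32, 10_000, 10_000)
b = rand(Float32, 10_000)
prob = LinearProblem(A, b)

sol = solve(prob, MetalLUFactorization())  # factorization runs on the GPU
```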

!!! note

    Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
    For details, see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
    If you are looking to best optimize your system, we suggest running the performance-tuning benchmark.

### Sparse Matrices

For sparse LU-factorizations, use `KLUFactorization` if there is less structure
to the sparsity pattern and `UMFPACKFactorization` if there is more structure.
@@ -31,12 +53,25 @@ As sparse matrices get larger, iterative solvers tend to get more efficient than
factorization methods if a lower tolerance of the solution is required.

Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
with CPUs and GPUs, and thus is the generally preferred form for Krylov methods.
with CPUs and GPUs, and thus is generally the preferred form for Krylov methods. The
choice of Krylov method should be the one most specialized to the properties of the
operator: for example, use `KrylovJL_CG()` if the operator is positive definite, and
`KrylovJL_GMRES()` if it has no good properties to exploit.
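As a brief sketch of matching the method to the operator (the positive-definite construction
here is illustrative):

```julia
using LinearAlgebra, LinearSolve

n = 100
B = rand(n, n)
A = B' * B + n * I  # symmetric positive definite by construction
b = rand(n)

sol = solve(LinearProblem(A, b), KrylovJL_CG())              # exploits positive definiteness
sol = solve(LinearProblem(rand(n, n), b), KrylovJL_GMRES())  # no structure assumed
```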

Finally, a user can pass a custom function for handling the linear solve using
`LinearSolveFunction()` if existing solvers are not optimally suited for their application.
The interface is detailed [here](@ref custom).
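For instance, a minimal sketch of wrapping a hand-written solver (here just `\`; the argument
list follows the custom-function interface documented at the link above, and `my_linsolve` is
a hypothetical name):

```julia
using LinearSolve

function my_linsolve(A, b, u, p, newA, Pl, Pr, solverdata; kwargs...)
    u = A \ b  # any custom solution strategy could go here
    return u
end

prob = LinearProblem(rand(4, 4), rand(4))
sol = solve(prob, LinearSolveFunction(my_linsolve))
```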

### Lazy SciMLOperators

If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
then using a Krylov method is preferred in order to not concretize the matrix.
Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
with CPUs and GPUs, and thus is generally the preferred form for Krylov methods. The
choice of Krylov method should be the one most specialized to the properties of the
operator: for example, use `KrylovJL_CG()` if the operator is positive definite, and
`KrylovJL_GMRES()` if it has no good properties to exploit.
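A hedged sketch of solving with a lazy operator (the exact `FunctionOperator` constructor
signature is per SciMLOperators.jl and is an assumption here; the operator applies `2I`
without ever forming a matrix):

```julia
using LinearSolve, SciMLOperators

# In-place lazy action du .= 2u, standing in for any matrix-free operator
op = FunctionOperator((du, u, p, t) -> du .= 2 .* u, zeros(4), zeros(4))

prob = LinearProblem(op, rand(4))
sol = solve(prob, KrylovJL_GMRES())  # Krylov iterations only need op * u
```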

## Full List of Methods

### RecursiveFactorization.jl
@@ -121,6 +156,26 @@ KrylovJL
MKLLUFactorization
```

### AppleAccelerate.jl

!!! note

    Using this solver requires a Mac with Apple Accelerate, which comes standard on modern Mac computers.

```@docs
AppleAccelerateLUFactorization
```

### Metal.jl

!!! note

    Using this solver requires adding the package Metal.jl, i.e. `using Metal`. This package is only compatible with Mac M-series computers with a Metal-compatible GPU.

```@docs
MetalLUFactorization
```

### Pardiso.jl

!!! note
73 changes: 71 additions & 2 deletions ext/LinearSolveMKLExt.jl
@@ -27,6 +27,63 @@
    A, ipiv, info[], info # Error code is stored in the LU factorization type
end

function getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
    require_one_based_indexing(A)
    check && chkfinite(A)
    chkstride1(A)
    m, n = size(A)
    lda = max(1, stride(A, 2))
    if isempty(ipiv)
        ipiv = similar(A, BlasInt, min(size(A,1), size(A,2)))
    end
    ccall((@blasfunc(sgetrf_), MKL_jll.libmkl_rt), Cvoid,
          (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
           Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
          m, n, A, lda, ipiv, info)
    chkargsok(info[])
    A, ipiv, info[], info # Error code is stored in the LU factorization type
end

function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{Cint}, B::AbstractVecOrMat{<:Float64}; info = Ref{Cint}())
    require_one_based_indexing(A, ipiv, B)
    LinearAlgebra.LAPACK.chktrans(trans)
    chkstride1(A, B, ipiv)
    n = LinearAlgebra.checksquare(A)
    if n != size(B, 1)
        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
    end
    if n != length(ipiv)
        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
    end
    nrhs = size(B, 2)
    ccall(("dgetrs_", MKL_jll.libmkl_rt), Cvoid,
          (Ref{UInt8}, Ref{Cint}, Ref{Cint}, Ptr{Float64}, Ref{Cint},
           Ptr{Cint}, Ptr{Float64}, Ref{Cint}, Ptr{Cint}, Clong),
          trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
    B
end

function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{Cint}, B::AbstractVecOrMat{<:Float32}; info = Ref{Cint}())
    require_one_based_indexing(A, ipiv, B)
    LinearAlgebra.LAPACK.chktrans(trans)
    chkstride1(A, B, ipiv)
    n = LinearAlgebra.checksquare(A)
    if n != size(B, 1)
        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
    end
    if n != length(ipiv)
        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
    end
    nrhs = size(B, 2)
    ccall(("sgetrs_", MKL_jll.libmkl_rt), Cvoid,
          (Ref{UInt8}, Ref{Cint}, Ref{Cint}, Ptr{Float32}, Ref{Cint},
           Ptr{Cint}, Ptr{Float32}, Ref{Cint}, Ptr{Cint}, Clong),
          trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
    B
end

default_alias_A(::MKLLUFactorization, ::Any, ::Any) = false
default_alias_b(::MKLLUFactorization, ::Any, ::Any) = false

@@ -47,8 +104,20 @@
        cache.cacheval = fact
        cache.isfresh = false
    end

    y = ldiv!(cache.u, @get_cacheval(cache, :MKLLUFactorization)[1], cache.b)
    SciMLBase.build_linear_solution(alg, y, nothing, cache)

    A, info = @get_cacheval(cache, :MKLLUFactorization)
    LinearAlgebra.require_one_based_indexing(cache.u, cache.b)
    m, n = size(A, 1), size(A, 2)
    if m > n
        Bc = copy(cache.b)
        getrs!('N', A.factors, A.ipiv, Bc; info)
        return copyto!(cache.u, 1, Bc, 1, n)
    else
        copyto!(cache.u, cache.b)
        getrs!('N', A.factors, A.ipiv, cache.u; info)
    end

    SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
end

end
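For reference, a hedged end-to-end sketch of what exercises these wrappers (the `getrf!`/`getrs!`
methods above are internal to the extension; the public entry point is `MKLLUFactorization`):

```julia
using LinearSolve, MKL_jll

A = rand(Float32, 100, 100)
b = rand(Float32, 100)

# For Float32 inputs this dispatches through sgetrf_ (factor) and sgetrs_ (solve) defined above
sol = solve(LinearProblem(A, b), MKLLUFactorization())
```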