Setup Accelerate and MKL for 32-bit, MKL getrf, and Metal.jl integration #361

Merged: 10 commits, Aug 10, 2023
7 changes: 5 additions & 2 deletions Project.toml
@@ -30,17 +30,19 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
HYPRE = "b5ffcf37-a2bd-41ab-a3da-4bd9bc8ad771"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
Pardiso = "46dd5b70-b6fb-5a00-ae2d-e8fea33afaf2"

[extensions]
LinearSolveCUDAExt = "CUDA"
LinearSolveHYPREExt = "HYPRE"
LinearSolveIterativeSolversExt = "IterativeSolvers"
LinearSolveMKLExt = "MKL_jll"
LinearSolveKrylovKitExt = "KrylovKit"
LinearSolveMKLExt = "MKL_jll"
LinearSolveMetalExt = "Metal"
LinearSolvePardisoExt = "Pardiso"

[compat]
@@ -74,6 +76,7 @@ IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
KrylovKit = "0b1a1467-8014-51b9-945f-bf0ae24f4b77"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
MultiFloats = "bdf0d083-296b-4888-a5b6-7498122e68a5"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
51 changes: 51 additions & 0 deletions benchmarks/applelu.jl
@@ -0,0 +1,51 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

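# Approximate flop count for LU factorization of an m×n matrix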
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 4:8:500
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metallubench.png")
savefig("metallubench.pdf")
51 changes: 51 additions & 0 deletions benchmarks/cudalu.jl
@@ -0,0 +1,51 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, CUDA, MKL_jll
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

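# Approximate flop count for LU factorization of an m×n matrix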
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [MKLLUFactorization(), CudaOffloadFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 200:400:20000
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("cudaoffloadlubench.png")
savefig("cudaoffloadlubench.pdf")
51 changes: 51 additions & 0 deletions benchmarks/metallu.jl
@@ -0,0 +1,51 @@
using BenchmarkTools, Random, VectorizationBase
using LinearAlgebra, LinearSolve, Metal
nc = min(Int(VectorizationBase.num_cores()), Threads.nthreads())
BLAS.set_num_threads(nc)
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 0.5

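# Approximate flop count for LU factorization of an m×n matrix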
function luflop(m, n = m; innerflop = 2)
    sum(1:min(m, n)) do k
        invflop = 1
        scaleflop = isempty((k + 1):m) ? 0 : sum((k + 1):m)
        updateflop = isempty((k + 1):n) ? 0 :
                     sum((k + 1):n) do j
                         isempty((k + 1):m) ? 0 : sum((k + 1):m) do i
                             innerflop
                         end
                     end
        invflop + scaleflop + updateflop
    end
end

algs = [AppleAccelerateLUFactorization(), MetalLUFactorization()]
res = [Float32[] for i in 1:length(algs)]

ns = 200:400:20000
for i in 1:length(ns)
    n = ns[i]
    @info "$n × $n"
    rng = MersenneTwister(123)
    global A = rand(rng, Float32, n, n)
    global b = rand(rng, Float32, n)
    global u0 = rand(rng, Float32, n)

    for j in 1:length(algs)
        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A = true, alias_b = true))
        push!(res[j], luflop(n) / bt / 1e9)
    end
end

using Plots
__parameterless_type(T) = Base.typename(T).wrapper
parameterless_type(x) = __parameterless_type(typeof(x))
parameterless_type(::Type{T}) where {T} = __parameterless_type(T)

p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
for i in 2:length(res)
    plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
end
p

savefig("metal_large_lubench.png")
savefig("metal_large_lubench.pdf")
65 changes: 60 additions & 5 deletions docs/src/solvers/solvers.md
@@ -7,15 +7,37 @@ Solves for ``Au=b`` in the problem defined by `prob` using the algorithm

## Recommended Methods

### Dense Matrices

The default algorithm (`nothing`) picks an algorithm that will work, but one may need to
change it to get more performance or precision. If more precision is necessary,
`QRFactorization()` and `SVDFactorization()` are the best choices, with SVD being
the slowest but most precise.

For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations.
`FastLUFactorization` will be faster than `LUFactorization` which is the Base.LinearAlgebra
(`\` default) implementation of LU factorization. `SimpleLUFactorization` will be fast
on very small matrices.
For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations up to around
150×150 matrices, though this can depend on the exact details of the hardware. Beyond this
point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers,
`AppleAccelerateLUFactorization` is generally always the fastest. `LUFactorization` will
use your base system BLAS, which can be fast or slow depending on the hardware configuration.
`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.
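For example, a minimal sketch of opting into one of these methods (the matrix size here is
illustrative; `MKL_jll` must be installed for the MKL extension to load, per the `[weakdeps]`
entry above):

```julia
using LinearSolve, MKL_jll

A = rand(300, 300)
b = rand(300)
prob = LinearProblem(A, b)

sol = solve(prob)                        # default algorithm choice
sol = solve(prob, MKLLUFactorization())  # explicit MKL LU, often faster at this size
sol.u                                    # the solution vector
```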

For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
on Mac hardware to offload, with a cutoff point of around 20,000×20,000 matrices beyond which it
is faster (and it only supports Float32). `CudaOffloadFactorization` can be more efficient at a
much smaller cutoff, possibly around 1,000×1,000 matrices, though this is highly dependent
on the chosen GPU hardware, and it requires a CUDA-compatible NVIDIA GPU.
CUDA offload supports Float64, but most consumer GPU hardware is much faster on Float32
(many are >32x faster for Float32 operations than Float64 operations), and thus for most hardware
this is only recommended for Float32 matrices.
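As an illustrative sketch of GPU offload (hedged: the cutoffs above vary by hardware, and
Metal.jl requires an M-series Mac and Float32):

```julia
using LinearSolve, Metal  # loading Metal enables MetalLUFactorization

A = rand(Float32, 10_000, 10_000)
b = rand(Float32, 10_000)
prob = LinearProblem(A, b)

sol = solve(prob, MetalLUFactorization())  # factorization runs on the GPU
```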

!!! note

    Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
    For details, see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
    If you are looking to best optimize your system, we suggest running the performance-tuning benchmark.

### Sparse Matrices

For sparse LU-factorizations, use `KLUFactorization` if there is less structure
to the sparsity pattern and `UMFPACKFactorization` if there is more structure.
@@ -31,12 +53,25 @@ As sparse matrices get larger, iterative solvers tend to get more efficient than
factorization methods if a lower tolerance of the solution is required.

Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
with CPUs and GPUs, and thus is the generally preferred form for Krylov methods.
with CPUs and GPUs, and thus is generally the preferred form for Krylov methods. The
choice of Krylov method should be the one most specialized to the properties of the
operator: for example, use `KrylovJL_CG()` if the operator is positive definite, and
`KrylovJL_GMRES()` if it has no good properties to exploit.
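As a brief sketch of matching the method to the operator (the positive-definite construction
here is illustrative):

```julia
using LinearAlgebra, LinearSolve

n = 100
B = rand(n, n)
A = B' * B + n * I  # symmetric positive definite by construction
b = rand(n)

sol = solve(LinearProblem(A, b), KrylovJL_CG())              # exploits positive definiteness
sol = solve(LinearProblem(rand(n, n), b), KrylovJL_GMRES())  # no structure assumed
```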

Finally, a user can pass a custom function for handling the linear solve using
`LinearSolveFunction()` if existing solvers are not optimally suited for their application.
The interface is detailed [here](@ref custom).
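For instance, a minimal sketch of wrapping a hand-written solver (here just `\`; the argument
list follows the custom-function interface documented at the link above, and `my_linsolve` is
a hypothetical name):

```julia
using LinearSolve

function my_linsolve(A, b, u, p, newA, Pl, Pr, solverdata; kwargs...)
    u = A \ b  # any custom solution strategy could go here
    return u
end

prob = LinearProblem(rand(4, 4), rand(4))
sol = solve(prob, LinearSolveFunction(my_linsolve))
```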

### Lazy SciMLOperators

If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
then using a Krylov method is preferred in order to not concretize the matrix.
Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
with CPUs and GPUs, and thus is generally the preferred form for Krylov methods. The
choice of Krylov method should be the one most specialized to the properties of the
operator: for example, use `KrylovJL_CG()` if the operator is positive definite, and
`KrylovJL_GMRES()` if it has no good properties to exploit.
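A hedged sketch of solving with a lazy operator (the exact `FunctionOperator` constructor
signature is per SciMLOperators.jl and is an assumption here; the operator applies `2I`
without ever forming a matrix):

```julia
using LinearSolve, SciMLOperators

# In-place lazy action du .= 2u, standing in for any matrix-free operator
op = FunctionOperator((du, u, p, t) -> du .= 2 .* u, zeros(4), zeros(4))

prob = LinearProblem(op, rand(4))
sol = solve(prob, KrylovJL_GMRES())  # Krylov iterations only need op * u
```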

## Full List of Methods

### RecursiveFactorization.jl
@@ -121,6 +156,26 @@ KrylovJL
MKLLUFactorization
```

### AppleAccelerate.jl

!!! note

    Using this solver requires a Mac with Apple Accelerate, which comes standard on modern Mac computers.

```@docs
AppleAccelerateLUFactorization
```

### Metal.jl

!!! note

    Using this solver requires adding the package Metal.jl, i.e. `using Metal`. This package is only compatible with Mac M-series computers with a Metal-compatible GPU.

```@docs
MetalLUFactorization
```

### Pardiso.jl

!!! note
73 changes: 71 additions & 2 deletions ext/LinearSolveMKLExt.jl
@@ -27,6 +27,63 @@
    A, ipiv, info[], info # Error code is stored in the LU factorization type
end

function getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
    require_one_based_indexing(A)
    check && chkfinite(A)
    chkstride1(A)
    m, n = size(A)
    lda = max(1, stride(A, 2))
    if isempty(ipiv)
        ipiv = similar(A, BlasInt, min(size(A,1), size(A,2)))
    end
    ccall((@blasfunc(sgetrf_), MKL_jll.libmkl_rt), Cvoid,
          (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
           Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
          m, n, A, lda, ipiv, info)
    chkargsok(info[])
    A, ipiv, info[], info # Error code is stored in the LU factorization type
end

function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{Cint}, B::AbstractVecOrMat{<:Float64}; info = Ref{Cint}())
    require_one_based_indexing(A, ipiv, B)
    LinearAlgebra.LAPACK.chktrans(trans)
    chkstride1(A, B, ipiv)
    n = LinearAlgebra.checksquare(A)
    if n != size(B, 1)
        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
    end
    if n != length(ipiv)
        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
    end
    nrhs = size(B, 2)
    ccall(("dgetrs_", MKL_jll.libmkl_rt), Cvoid,
          (Ref{UInt8}, Ref{Cint}, Ref{Cint}, Ptr{Float64}, Ref{Cint},
           Ptr{Cint}, Ptr{Float64}, Ref{Cint}, Ptr{Cint}, Clong),
          trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
    B
end

function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{Cint}, B::AbstractVecOrMat{<:Float32}; info = Ref{Cint}())
    require_one_based_indexing(A, ipiv, B)
    LinearAlgebra.LAPACK.chktrans(trans)
    chkstride1(A, B, ipiv)
    n = LinearAlgebra.checksquare(A)
    if n != size(B, 1)
        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
    end
    if n != length(ipiv)
        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
    end
    nrhs = size(B, 2)
    ccall(("sgetrs_", MKL_jll.libmkl_rt), Cvoid,
          (Ref{UInt8}, Ref{Cint}, Ref{Cint}, Ptr{Float32}, Ref{Cint},
           Ptr{Cint}, Ptr{Float32}, Ref{Cint}, Ptr{Cint}, Clong),
          trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
    B
end

default_alias_A(::MKLLUFactorization, ::Any, ::Any) = false
default_alias_b(::MKLLUFactorization, ::Any, ::Any) = false

@@ -47,8 +104,20 @@
        cache.cacheval = fact
        cache.isfresh = false
    end

    y = ldiv!(cache.u, @get_cacheval(cache, :MKLLUFactorization)[1], cache.b)
    SciMLBase.build_linear_solution(alg, y, nothing, cache)

    A, info = @get_cacheval(cache, :MKLLUFactorization)
    LinearAlgebra.require_one_based_indexing(cache.u, cache.b)
    m, n = size(A, 1), size(A, 2)
    if m > n
        Bc = copy(cache.b)
        getrs!('N', A.factors, A.ipiv, Bc; info)
        return copyto!(cache.u, 1, Bc, 1, n)
    else
        copyto!(cache.u, cache.b)
        getrs!('N', A.factors, A.ipiv, cache.u; info)
    end

    SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
end

end
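For reference, a hedged end-to-end sketch of what exercises these wrappers (the `getrf!`/`getrs!`
methods above are internal to the extension; the public entry point is `MKLLUFactorization`):

```julia
using LinearSolve, MKL_jll

A = rand(Float32, 100, 100)
b = rand(Float32, 100)

# For Float32 inputs this dispatches through sgetrf_ (factor) and sgetrs_ (solve) defined above
sol = solve(LinearProblem(A, b), MKLLUFactorization())
```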