This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

refactor: move JuliaSIMD deps to extensions #175

Merged
merged 11 commits on Oct 18, 2024
25 changes: 24 additions & 1 deletion .github/workflows/CI.yml
@@ -21,7 +21,7 @@ concurrency:

jobs:
ci:
name: Julia ${{ matrix.version }} - ${{ matrix.test_group }} - ${{ matrix.os }} - ${{ matrix.blas_backend }}
name: Julia ${{ matrix.version }} - ${{ matrix.test_group }} - ${{ matrix.os }} - ${{ matrix.blas_backend }} - ${{ matrix.loopvec }}
if: ${{ !contains(github.event.head_commit.message, '[skip tests]') }}
runs-on: ${{ matrix.os }}
strategy:
@@ -43,27 +43,49 @@ jobs:
- "others"
blas_backend:
- "default"
loopvec:
- "true"
include:
- os: ubuntu-latest
test_group: "dense"
blas_backend: "blis"
version: "1.10"
loopvec: "true"
- os: ubuntu-latest
test_group: "dense"
blas_backend: "mkl"
version: "1.10"
loopvec: "true"
- os: ubuntu-latest
test_group: "dense"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: ubuntu-latest
test_group: "batched_ops"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: ubuntu-latest
test_group: "other_ops"
blas_backend: "default"
version: "1.10"
loopvec: "false"
- os: macos-latest
test_group: "dense"
blas_backend: "appleaccelerate"
version: "1.10"
loopvec: "true"
- os: macos-latest
test_group: "all"
blas_backend: "default"
version: "1.10"
loopvec: "true"
- os: windows-latest
test_group: "all"
blas_backend: "default"
version: "1.10"
loopvec: "true"
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
@@ -84,6 +106,7 @@ jobs:
env:
LUXLIB_TEST_GROUP: ${{ matrix.test_group }}
LUXLIB_BLAS_BACKEND: ${{ matrix.blas_backend }}
LUXLIB_LOAD_LOOPVEC: ${{ matrix.loopvec }}
- uses: julia-actions/julia-processcoverage@v1
with:
directories: src,ext
13 changes: 9 additions & 4 deletions Project.toml
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.3"
version = "1.3.4"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -15,16 +15,14 @@ ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
Hwloc = "0e44f5e4-bd66-52a0-8798-143a42290a1d"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -36,7 +34,10 @@ BLISBLAS = "6f275bd8-fec0-4d39-945b-7e95a765fa1e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
SLEEFPirates = "476501e8-09a2-5ece-8869-fb82de89a1fa"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

@@ -46,7 +47,10 @@ LuxLibBLISBLASExt = "BLISBLAS"
LuxLibCUDAExt = "CUDA"
LuxLibMKLExt = "MKL"
LuxLibEnzymeExt = "Enzyme"
LuxLibLoopVectorizationExt = "LoopVectorization"
LuxLibOctavianExt = ["Octavian", "LoopVectorization"]
LuxLibReverseDiffExt = "ReverseDiff"
LuxLibSLEEFPiratesExt = "SLEEFPirates"
LuxLibTrackerAMDGPUExt = ["AMDGPU", "Tracker"]
LuxLibTrackerExt = "Tracker"
LuxLibcuDNNExt = ["CUDA", "cuDNN"]
@@ -75,6 +79,7 @@ MLDataDevices = "1.2"
Markdown = "1.10"
NNlib = "0.9.24"
Octavian = "0.3.28"
Preferences = "1.4.3"
Polyester = "0.7.15"
Random = "1.10"
Reexport = "1"
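A note on the effect of the `Project.toml` change above: LoopVectorization.jl, Octavian.jl, and SLEEFPirates.jl are now weak dependencies, so LuxLib no longer installs or loads them by itself; the matching extension modules are only compiled and loaded once the user brings in the trigger packages. A minimal sketch (not part of the diff) of what that looks like on Julia 1.9+:

```julia
using LuxLib                        # no JuliaSIMD packages are loaded at this point
using LoopVectorization, Octavian   # loading the triggers activates the extensions

# Base.get_extension returns the extension module once it has loaded, else nothing
@assert Base.get_extension(LuxLib, :LuxLibLoopVectorizationExt) !== nothing
@assert Base.get_extension(LuxLib, :LuxLibOctavianExt) !== nothing
```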
2 changes: 2 additions & 0 deletions benchmarks/Project.toml
@@ -1,9 +1,11 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
1 change: 1 addition & 0 deletions benchmarks/runbenchmarks.jl
@@ -3,6 +3,7 @@ using Pkg
using BenchmarkTools
using InteractiveUtils
using LinearAlgebra
using Octavian, LoopVectorization

const SUITE = BenchmarkGroup()
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 5
72 changes: 72 additions & 0 deletions ext/LuxLibLoopVectorizationExt.jl
@@ -0,0 +1,72 @@
module LuxLibLoopVectorizationExt

using LoopVectorization: LoopVectorization, @tturbo, @turbo, indices
using Polyester: @batch
using Static: True

using LuxLib: LuxLib, Utils

Utils.is_extension_loaded(::Val{:LoopVectorization}) = True()

Utils.can_loopvec_args_check(::True, args...) = LoopVectorization.check_args(args...)

# matmul
for serial in (true, false)
opname = serial ? :serial_matmul_loopvec! : :matmul_loopvec!
@eval @inline function LuxLib.Impl.$(opname)(
C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α::Number, β::Number)
if !iszero(β) # Special-case this because Base.FastMath.mul_fast(NaN, false) = NaN
@turbo thread=$(!serial) for K in indices((C, B), 2), J in indices((C, A), 1)
Cⱼₖ = zero(eltype(C))
for I in indices((A, B), (2, 1))
Cⱼₖ += A[J, I] * B[I, K]
end
C[J, K] = α * Cⱼₖ + β * C[J, K]
end
else
@turbo thread=$(!serial) for K in indices((C, B), 2), J in indices((C, A), 1)
Cⱼₖ = zero(eltype(C))
for I in indices((A, B), (2, 1))
Cⱼₖ += A[J, I] * B[I, K]
end
C[J, K] = α * Cⱼₖ
end
end
end
end

@inline function LuxLib.Impl.matmuladd_loopvec!(
C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, bias::AbstractVector)
@tturbo for K in indices((C, B), 2), J in indices((C, A), 1)
Cⱼₖ = zero(eltype(C))
for I in indices((A, B), (2, 1))
Cⱼₖ += A[J, I] * B[I, K]
end
C[J, K] = bias[J] + Cⱼₖ
end
return
end

# batched matmul
function LuxLib.Impl.batched_matmul_loopvec_impl!(
z::AbstractArray{zT, 3}, x::AbstractArray{xT, 3},
y::AbstractArray{yT, 3}, α::Number=true, β::Number=false) where {zT, xT, yT}
if size(x, 3) == size(y, 3)
@batch for L in axes(z, 3)
LuxLib.Impl.serial_matmul_loopvec!(
Utils.batchview(z, L), Utils.batchview(x, L), Utils.batchview(y, L), α, β)
end
elseif size(x, 3) == 1
@batch for L in axes(z, 3)
LuxLib.Impl.serial_matmul_loopvec!(
Utils.batchview(z, L), Utils.batchview(x, 1), Utils.batchview(y, L), α, β)
end
else # has to be size(y, 3) == 1
@batch for L in axes(z, 3)
LuxLib.Impl.serial_matmul_loopvec!(
Utils.batchview(z, L), Utils.batchview(x, L), Utils.batchview(y, 1), α, β)
end
end
end

end
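For reference, the generated `matmul_loopvec!` / `serial_matmul_loopvec!` methods follow the 5-argument GEMM convention `C = α*A*B + β*C`, and the `iszero(β)` branch exists precisely so that an uninitialized `C` never feeds `NaN` into `mul_fast`. A rough sketch of exercising the threaded variant directly (internal `Impl` API, shown only to illustrate the convention; requires the extension to be loaded):

```julia
using LuxLib, LoopVectorization   # loading LoopVectorization activates this extension

A = rand(Float32, 8, 16)
B = rand(Float32, 16, 4)
C = Matrix{Float32}(undef, 8, 4)  # may contain garbage; the β = 0 branch never reads it

LuxLib.Impl.matmul_loopvec!(C, A, B, true, false)  # C = 1*A*B + 0*C
@assert C ≈ A * B
```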
16 changes: 16 additions & 0 deletions ext/LuxLibOctavianExt.jl
@@ -0,0 +1,16 @@
module LuxLibOctavianExt

using Octavian: Octavian
using Static: True

using LuxLib: LuxLib, Utils

Utils.is_extension_loaded(::Val{:Octavian}) = True()

@inline function LuxLib.Impl.matmul_octavian!(
C::AbstractMatrix, A::AbstractMatrix, B::AbstractMatrix, α::Number, β::Number)
Octavian.matmul!(C, A, B, α, β)
return
end

end
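Worth noting: per the `[extensions]` mapping above, `LuxLibOctavianExt` has two triggers, so loading Octavian.jl alone is not enough; LoopVectorization.jl must also be in the session before `matmul_octavian!` becomes available. A hedged sketch:

```julia
using LuxLib, Octavian, LoopVectorization  # both triggers are required for this extension

C = Matrix{Float32}(undef, 32, 32)
# Thin forwarding to Octavian.matmul! with the same C = α*A*B + β*C convention
LuxLib.Impl.matmul_octavian!(C, rand(Float32, 32, 64), rand(Float32, 64, 32), true, false)
```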
58 changes: 58 additions & 0 deletions ext/LuxLibSLEEFPiratesExt.jl
@@ -0,0 +1,58 @@
module LuxLibSLEEFPiratesExt

using ChainRulesCore: ChainRulesCore
using NNlib: NNlib
using SLEEFPirates: SLEEFPirates

using LuxLib: Numeric, Impl

const CRC = ChainRulesCore

sigmoid_fast(x::Number) = SLEEFPirates.sigmoid_fast(x)
softplus(x::Number) = SLEEFPirates.softplus(x)
logsigmoid(x::Number) = -softplus(-x)
swish(x::Number) = Base.FastMath.mul_fast(x, sigmoid_fast(x))
lisht(x::Number) = Base.FastMath.mul_fast(x, tanh_fast(x))
tanh(x::Number) = SLEEFPirates.tanh(x)
tanh_fast(x::Number) = SLEEFPirates.tanh_fast(x)

for (f, dfdx) in [
#! format: off
(:sigmoid_fast, :(conj(Base.FastMath.mul_fast(Ω, Base.FastMath.sub_fast(1, Ω))))),
(:softplus, :(sigmoid_fast(x))),
(:logsigmoid, :(sigmoid_fast(-x))),
(:swish, :(Base.FastMath.add_fast(Ω, Base.FastMath.mul_fast(sigmoid_fast(x), Base.FastMath.sub_fast(1, Ω))))),
(:lisht, :(Base.FastMath.add_fast(x, Base.FastMath.mul_fast(tanh_fast(x), Base.FastMath.sub_fast(1, Ω))))),
(:tanh, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω))))),
(:tanh_fast, :(conj(Base.FastMath.sub_fast(1, Base.FastMath.mul_fast(Ω, Ω)))))
#! format: on
]
@eval CRC.@scalar_rule($f(x), $(dfdx))

∇f = Symbol(:∇broadcasted_, f)
@eval function CRC.rrule(::typeof(Broadcast.broadcasted), ::typeof($f),
x::Union{Numeric, Broadcast.Broadcasted})
Ω = $(f).(x)
function $(∇f)(dΩ)
∂x = CRC.InplaceableThunk(dx -> @.(dx+=dΩ * $(dfdx)), CRC.@thunk @.(dΩ*$(dfdx)))
return CRC.NoTangent(), CRC.NoTangent(), ∂x
end
return Ω, $(∇f)
end
end

for (fbase, ffast) in [
#! format: off
(NNlib.sigmoid_fast, sigmoid_fast),
(NNlib.softplus, softplus),
(NNlib.logsigmoid, logsigmoid),
(NNlib.swish, swish),
(NNlib.lisht, lisht),
(Base.tanh, tanh),
(NNlib.tanh_fast, tanh_fast)
#! format: on
]
@eval Impl.sleefpirates_fast_act(::typeof($fbase)) = $ffast
end

end
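The mapping installed at the bottom of this file is what LuxLib's internals use to swap NNlib activations for their SLEEFPirates counterparts, while the `@scalar_rule`/`rrule` definitions above keep the replacements AD-friendly. A small sketch of the mapping in isolation (internal `Impl` API; only the functions listed above are covered):

```julia
using LuxLib, SLEEFPirates
using NNlib: tanh_fast

f = LuxLib.Impl.sleefpirates_fast_act(tanh_fast)  # returns the SLEEFPirates-backed kernel
f(0.5f0)                  # scalar call
f.(rand(Float32, 128))    # broadcasting uses the rrules defined above under AD
```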
3 changes: 3 additions & 0 deletions src/LuxLib.jl
@@ -1,6 +1,7 @@
module LuxLib

using Compat: @compat
using Preferences: @load_preference
using Reexport: @reexport
using Static: Static, known

Expand All @@ -15,6 +16,8 @@ const Numeric = Union{AbstractArray{<:T}, T} where {T <: Number}
const ∂∅ = NoTangent()
const CRC = ChainRulesCore

const DISABLE_LOOP_VECTORIZATION = @load_preference("disable_loop_vectorization", false)

include("utils.jl")
include("traits.jl")
include("impl/Impl.jl")
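The new `disable_loop_vectorization` preference is read at compile time via `@load_preference`, so it gives a persistent opt-out from the SIMD code paths even when the trigger packages are installed. A sketch of setting it (the value lands in `LocalPreferences.toml` and only takes effect after restarting Julia):

```julia
using Preferences, UUIDs

const LUXLIB_UUID = UUID("82251201-b29d-42c6-8e01-566dec8acb11")  # from Project.toml above

# Opt out of the LoopVectorization/Octavian kernels; restart Julia afterwards
set_preferences!(LUXLIB_UUID, "disable_loop_vectorization" => true; force=true)
```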
2 changes: 1 addition & 1 deletion src/api/activation.jl
@@ -10,7 +10,7 @@ generic implementation.
This function doesn't replace `σ` with `NNlib.fast_act(σ, ...)`; that needs to be
done by the user if needed.

!!! tip
!!! tip "Load `SLEEFPirates.jl` to get faster activations"

Certain activation functions are replaced with specialized implementations from
[SLEEFPirates.jl](https://github.com/JuliaSIMD/SLEEFPirates.jl) for FP32. This might
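To make the reworded tip concrete, a hedged usage sketch (assuming `fast_activation!!` is the public entry point documented here, which is not visible in this hunk; the SLEEFPirates swap only applies to supported element types such as `Float32`):

```julia
using LuxLib, SLEEFPirates   # without SLEEFPirates the generic NNlib path is used

x = rand(Float32, 256, 32)
y = fast_activation!!(tanh, x)   # may dispatch to the SLEEFPirates tanh kernel
```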
5 changes: 5 additions & 0 deletions src/api/batched_mul.jl
@@ -4,6 +4,11 @@
Computes the batched matrix multiplication of `x` and `y`. For more details see the NNlib
documentation on `NNlib.batched_mul`. This function is mostly a wrapper around `batched_mul`
but attempts to be faster on CPUs.

!!! tip "Load `LoopVectorization.jl` to get faster batched matrix multiplication"

On CPUs, loading `LoopVectorization.jl` adds faster implementations of batched matrix
multiplication.
"""
function batched_matmul(x::AbstractMatrix, y::AbstractArray{yT, 3}) where {yT}
return batched_matmul(expand_batchdim(x), y)
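A short usage sketch for the tip added above (shapes chosen arbitrarily; on CPU the LoopVectorization path is used when `LoopVectorization.check_args` accepts the inputs, otherwise the NNlib fallback runs):

```julia
using LuxLib, LoopVectorization

x = rand(Float32, 4, 8, 16)   # 16 matrices of size 4×8
y = rand(Float32, 8, 2, 16)   # 16 matrices of size 8×2
z = batched_matmul(x, y)      # size 4×2×16, same semantics as NNlib.batched_mul
```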
5 changes: 5 additions & 0 deletions src/api/dense.jl
@@ -24,6 +24,11 @@ multiple operations.
- For small CPU Arrays, we use LoopVectorization.jl. On `x86_64` we use Octavian for
medium sized matrices. This is overridden if special BLAS implementations are loaded
(currently `MKL`, `AppleAccelerate`, and `BLISBLAS`).

!!! tip "Load `Octavian.jl`

Loading `Octavian.jl` enables a polyalgorithm that uses different backends based on the
input sizes.
"""
function fused_dense_bias_activation(σ::F, weight::AbstractMatrix, x::AbstractMatrix,
b::Optional{<:AbstractVector}) where {F}
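And a sketch for the dense tip (the signature comes from the hunk above; loading both Octavian.jl and LoopVectorization.jl enables the size-based polyalgorithm on CPU):

```julia
using LuxLib, Octavian, LoopVectorization
using NNlib: relu

weight = rand(Float32, 32, 64)
x      = rand(Float32, 64, 128)
b      = rand(Float32, 32)

# Fused equivalent of relu.(weight * x .+ b); backend chosen based on input sizes
y = fused_dense_bias_activation(relu, weight, x, b)
```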
5 changes: 1 addition & 4 deletions src/impl/Impl.jl
@@ -12,8 +12,6 @@ using ForwardDiff: ForwardDiff

using KernelAbstractions: KernelAbstractions, @kernel, @Const, @index

using LoopVectorization: LoopVectorization, @turbo, @tturbo, indices
using Octavian: Octavian
using Polyester: @batch

using LinearAlgebra: LinearAlgebra, mul!
@@ -31,15 +29,14 @@ using ..Utils: Utils, NotaNumber, batchview, concrete_bias_act_output_eltype, co
copy_drop_gradients, eltype_mismatch, expand_batchdim,
maybe_reduce_BLAS_threads, ofeltype_array, only_derivative, remove_tracking,
reset_BLAS_threads, run_ka_kernel, safe_eltype, safe_vec, safe_warning,
unsafe_known, unrolled_mapreduce, @enzyme_alternative
unsafe_known, unrolled_mapreduce, can_loopvec_args, @enzyme_alternative
using ..Traits: activation_intermediate_not_needed, activation_has_rrule, is_mutable_array,
fuse_cpu_activation
using ..System: explicit_blas_loaded, use_octavian, fits_in_l1cache, fits_in_l2cache,
fits_in_l3cache

const CRC = ChainRulesCore
const KA = KernelAbstractions
const LV = LoopVectorization

include("activation.jl")
include("batched_mul.jl")