This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

Commit 3d41215: Merge pull request #53 from LuxDL/ap/fused_dense

Fused Operations

avik-pal authored Apr 24, 2024
2 parents: 867d4b5 + 6a08a48
Showing 34 changed files with 1,075 additions and 184 deletions.
24 changes: 19 additions & 5 deletions .buildkite/pipeline.yml
@@ -2,7 +2,7 @@ steps:
# CUDA Tests
- group: ":julia: CUDA GPU"
steps:
- label: ":julia: Julia {{matrix.julia}} + CUDA GPU"
- label: ":julia: Julia {{matrix.julia}} + {{matrix.test_group}} + CUDA GPU"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.julia}}"
@@ -17,13 +17,19 @@ steps:
queue: "juliagpu"
cuda: "*"
env:
GROUP: "CUDA"
BACKEND_GROUP: "CUDA"
LUXLIB_TEST_GROUP: "{{matrix.test_group}}"
if: build.message !~ /\[skip tests\]/
timeout_in_minutes: 60
matrix:
setup:
julia:
- "1"
test_group:
- "normalization"
- "common_ops"
- "others"
- "normalization_sp"

# Downstream CUDA Tests
- group: ":telescope: Downstream CUDA"
@@ -63,6 +69,7 @@ steps:
queue: "juliagpu"
cuda: "*"
env:
BACKEND_GROUP: "CUDA"
GROUP: "CUDA"
DOWNSTREAM_TEST_REPO: "{{matrix.repo}}"
if: build.message !~ /\[skip tests\]/ || build.message !~ /\[skip downstream\]/
@@ -78,7 +85,7 @@ steps:
# AMDGPU Tests
- group: ":julia: AMD GPU"
steps:
- label: ":julia: Julia: {{matrix.julia}} + AMD GPU"
- label: ":julia: Julia: {{matrix.julia}} + {{matrix.test_group}} + AMD GPU"
plugins:
- JuliaCI/julia#v1:
version: "{{matrix.julia}}"
@@ -93,7 +100,8 @@ steps:
JULIA_AMDGPU_CORE_MUST_LOAD: "1"
JULIA_AMDGPU_HIP_MUST_LOAD: "1"
JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
GROUP: "AMDGPU"
BACKEND_GROUP: "AMDGPU"
LUXLIB_TEST_GROUP: "{{matrix.test_group}}"
agents:
queue: "juliagpu"
rocm: "*"
@@ -104,6 +112,11 @@
setup:
julia:
- "1"
test_group:
- "normalization"
- "common_ops"
- "others"
- "normalization_sp"

# Downstream AMDGPU Tests
- group: ":telescope: Downstream AMD GPU"
@@ -145,6 +158,7 @@ steps:
rocmgpu: "*"
env:
GROUP: "AMDGPU"
BACKEND_GROUP: "AMDGPU"
JULIA_AMDGPU_CORE_MUST_LOAD: "1"
JULIA_AMDGPU_HIP_MUST_LOAD: "1"
JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
@@ -160,6 +174,6 @@ steps:
- "Boltz"

env:
RETESTITEMS_NWORKERS: 4
RETESTITEMS_NWORKERS: 2
RETESTITEMS_NWORKER_THREADS: 2
SECRET_CODECOV_TOKEN: "wMpDLaAVEHe6EJAc+LZBl4jF3wADVN6F+15vr/ONJHOv/XXbtYovuc1PCQwhz0AzZjWpSO12IDTyKfwVgYvqaGYfQ9yGyplJtSu2MiL2k44B/IY+wEZhsfkBIhXlG89si5A/I+/f8T8QuwxBqBLh8fYq7oxC+gNzKhbj8vIT4n5hCusvYYGufgKRC2U9P4ij0Sf40egQ5B+StaTykqJNq1163UARjNBypHIVDbYE0HUHiF7WB4eI5LxBBzlcHmsUkuGp6ZlqAu/8C83k65lwDnyHDfjvBM24q9GQTDFA5r7RUfYKHElQEBPk3GhoJn7XGIfD2pC0VNcw5jYCwsX2mw==;U2FsdGVkX1+euKMib66zno5Kkw7OxXo6v4RnkAA/HElJM46qfX17VgZ9iVLg45jOOWRgghmyYuy2WQ8RcVbuOg=="
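
The pipeline above now splits the test suite by a LUXLIB_TEST_GROUP environment variable. As a rough, hypothetical sketch of how a ReTestItems-based runner could consume it (the actual test/runtests.jl is not part of this diff, so the group tags and dispatch shown here are assumptions):

using ReTestItems, LuxLib

# Read the group selected by CI; default to running everything.
const TEST_GROUP = lowercase(get(ENV, "LUXLIB_TEST_GROUP", "all"))

if TEST_GROUP == "all"
    ReTestItems.runtests(LuxLib)
else
    # Assumes each @testitem is tagged with its group, e.g. tags=[:normalization]
    ReTestItems.runtests(LuxLib; tags=[Symbol(TEST_GROUP)])
end
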
8 changes: 7 additions & 1 deletion .github/workflows/CI.yml
@@ -19,6 +19,11 @@ jobs:
matrix:
version:
- "1"
test_group:
- "normalization"
- "common_ops"
- "others"
- "normalization_sp"
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
@@ -37,7 +42,8 @@ jobs:
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
env:
GROUP: "CPU"
BACKEND_GROUP: "CPU"
LUXLIB_TEST_GROUP: ${{ matrix.test_group }}
RETESTITEMS_NWORKERS: 4
RETESTITEMS_NWORKER_THREADS: 2
- uses: julia-actions/julia-processcoverage@v1
5 changes: 4 additions & 1 deletion .github/workflows/Downgrade.yml
@@ -14,8 +14,10 @@ jobs:
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
version: ['1.10']
test_group: ['normalization', 'common_ops', 'others', 'normalization_sp']
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
@@ -27,7 +29,8 @@ jobs:
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
env:
GROUP: "CPU"
BACKEND_GROUP: "CPU"
LUXLIB_TEST_GROUP: ${{ matrix.test_group }}
RETESTITEMS_NWORKERS: 4
RETESTITEMS_NWORKER_THREADS: 2
- uses: julia-actions/julia-processcoverage@v1
1 change: 1 addition & 0 deletions .github/workflows/Downstream.yml
@@ -57,6 +57,7 @@ jobs:
env:
RETESTITEMS_NWORKERS: 4
RETESTITEMS_NWORKER_THREADS: 2
BACKEND_GROUP: ${{ matrix.package.group }}
- uses: julia-actions/julia-processcoverage@v1
with:
directories: src,ext
15 changes: 13 additions & 2 deletions Project.toml
@@ -1,19 +1,24 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "0.3.14"
version = "0.3.15"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898"
FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Strided = "5e0ebb24-38b0-5f93-81fe-25c709ecae67"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
@@ -24,6 +29,7 @@ Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

[extensions]
LuxLibAMDGPUExt = "AMDGPU"
LuxLibForwardDiffExt = "ForwardDiff"
LuxLibReverseDiffExt = "ReverseDiff"
LuxLibTrackerAMDGPUExt = ["AMDGPU", "Tracker"]
@@ -34,13 +40,17 @@ LuxLibcuDNNExt = ["CUDA", "cuDNN"]
[compat]
AMDGPU = "0.8.4"
Aqua = "0.8.7"
ArrayInterface = "7.9"
CUDA = "5.2"
ChainRulesCore = "1.23"
ComponentArrays = "0.15.8"
ExplicitImports = "1.4.1"
FastBroadcast = "0.2.8"
FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
GPUArraysCore = "0.1.6"
KernelAbstractions = "0.9.15"
LinearAlgebra = "1.10"
LuxAMDGPU = "0.2.1"
LuxCUDA = "0.3.1"
LuxCore = "0.1.13"
@@ -49,11 +59,12 @@ Markdown = "1.10"
NNlib = "0.9.10"
PrecompileTools = "1.2"
Random = "1.10"
ReTestItems = "1"
ReTestItems = "1.23.1"
Reexport = "1"
ReverseDiff = "1.15"
StableRNGs = "1"
Statistics = "1.10"
Strided = "2"
Test = "1.10"
Tracker = "0.2.34"
Zygote = "0.6.69"
59 changes: 59 additions & 0 deletions ext/LuxLibAMDGPUExt.jl
@@ -0,0 +1,59 @@
module LuxLibAMDGPUExt

using LuxLib: LuxLib
using NNlib: NNlib
using AMDGPU: AMDGPU, ROCArray

const MIOPENFloat = Union{Float16, Float32}

# NNlib incorrectly defines some of the broadcasting rules. Probably this should be
# upstreamed to NNlib
@static if AMDGPU.functional(:MIOpen)
# Just define for dims = 6 , 7, 8 and hope no one uses it beyond that
for f in [NNlib.relu, NNlib.relu6, NNlib.softplus, NNlib.σ, Base.tanh], N in (6, 7, 8)
@eval function Base.materialize(bc::Broadcast.Broadcasted{
<:Any, <:Any, typeof($f), <:Tuple{ROCArray{<:MIOPENFloat, $N}}})
return copy(bc)
end
end
end

@inline function LuxLib.fused_conv_bias_activation(
σ::F, weight::ROCArray{Float64, N}, x::ROCArray{Float64, N},
b::ROCArray{Float64, N}, cdims::NNlib.ConvDims) where {F, N}
@warn "MIOpen doesn't support Float64 convolutions, type-casting everything to Float32 \
to avoid runtime errors" maxlog=1
return LuxLib._oftype_array(Float64,
LuxLib.fused_conv_bias_activation(
σ, LuxLib._oftype_array(Float32, weight), LuxLib._oftype_array(Float32, x),
LuxLib._oftype_array(Float32, b), cdims))
end

@inline function LuxLib.fused_conv_bias_activation(
σ::F, weight::ROCArray{Float64, N}, x::ROCArray{Float64, N},
b::Nothing, cdims::NNlib.ConvDims) where {F, N}
@warn "MIOpen doesn't support Float64 convolutions, type-casting everything to Float32 \
to avoid runtime errors" maxlog=1
return LuxLib._oftype_array(Float64,
LuxLib.fused_conv_bias_activation(σ, LuxLib._oftype_array(Float32, weight),
LuxLib._oftype_array(Float32, x), b, cdims))
end

@inline function LuxLib.__generic_conv_bias_activation(
act::F, weight::ROCArray{Float64, N}, x::ROCArray{Float64, N},
bias::ROCArray{Float64, N}, cdims::NNlib.ConvDims) where {N, F}
return LuxLib._oftype_array(Float64,
LuxLib.__generic_conv_bias_activation(
act, LuxLib._oftype_array(Float32, weight), LuxLib._oftype_array(Float32, x),
LuxLib._oftype_array(Float32, bias), cdims))
end

@inline function LuxLib.__generic_conv_bias_activation(
act::F, weight::ROCArray{Float64, N}, x::ROCArray{Float64, N},
bias::Nothing, cdims::NNlib.ConvDims) where {N, F}
return LuxLib._oftype_array(Float64,
LuxLib.__generic_conv_bias_activation(act, LuxLib._oftype_array(Float32, weight),
LuxLib._oftype_array(Float32, x), bias, cdims))
end

end
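
With this extension loaded, Float64 convolutions on ROCArrays are downcast to Float32 for MIOpen and the result is cast back to Float64. A minimal usage sketch, assuming a functional AMDGPU/MIOpen setup (shapes are illustrative only):

using AMDGPU, LuxLib, NNlib

weight = AMDGPU.rand(Float64, 3, 3, 4, 8)    # 3x3 kernel, 4 input -> 8 output channels
x      = AMDGPU.rand(Float64, 16, 16, 4, 2)  # WHCN input, batch of 2
bias   = AMDGPU.rand(Float64, 1, 1, 8, 1)    # broadcastable bias
cdims  = NNlib.DenseConvDims(x, weight)

# Emits the maxlog=1 warning above, computes in Float32, returns a Float64 array.
y = fused_conv_bias_activation(NNlib.relu, weight, x, bias, cdims)
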
20 changes: 19 additions & 1 deletion ext/LuxLibTrackerAMDGPUExt.jl
@@ -1,7 +1,8 @@
module LuxLibTrackerAMDGPUExt

using AMDGPU: AMDGPU
using NNlib: NNlib, PoolDims
using LuxLib: LuxLib
using NNlib: NNlib, ConvDims, PoolDims
using Tracker: Tracker, TrackedArray

const ROCTrackedArray{T, N} = TrackedArray{T, N, <:AMDGPU.ROCArray{T, N}}
@@ -55,4 +56,21 @@ for poolname in (:maxpool, :meanpool)
end
end

@inline function LuxLib.__generic_conv_bias_activation(
act::F, weight::ROCTrackedArray{Float64, N}, x::ROCTrackedArray{Float64, N},
bias::ROCTrackedArray{Float64, N}, cdims::ConvDims) where {N, F}
return LuxLib._oftype_array(Float64,
LuxLib.__generic_conv_bias_activation(
act, LuxLib._oftype_array(Float32, weight), LuxLib._oftype_array(Float32, x),
LuxLib._oftype_array(Float32, bias), cdims))
end

@inline function LuxLib.__generic_conv_bias_activation(
act::F, weight::ROCTrackedArray{Float64, N}, x::ROCTrackedArray{Float64, N},
bias::Nothing, cdims::ConvDims) where {N, F}
return LuxLib._oftype_array(Float64,
LuxLib.__generic_conv_bias_activation(act, LuxLib._oftype_array(Float32, weight),
LuxLib._oftype_array(Float32, x), bias, cdims))
end

end
5 changes: 2 additions & 3 deletions ext/LuxLibTrackerExt.jl
@@ -53,9 +53,8 @@ for T1 in (:TrackedArray, :AbstractArray),

LuxLib.__is_tracked(T1, T2, T3) || continue

@eval Tracker.@grad_from_chainrules LuxLib.groupnorm(
x::$T1{<:Union{Float32, Float64}, 4}, scale::$T2{<:Union{Float32, Float64}},
bias::$T3{<:Union{Float32, Float64}}; groups::Int, epsilon::Real)
@eval Tracker.@grad_from_chainrules LuxLib.__fast_groupnorm(
x::$T1, groups, scale::$T2, bias::$T3, epsilon::Real)
end

end
6 changes: 3 additions & 3 deletions ext/LuxLibTrackercuDNNExt.jl
@@ -16,12 +16,12 @@ const TR_BNParamType = Union{

function LuxLib.batchnorm(
x::TR_CUDNN_BN_ARRAY_TYPE, scale::TR_BNParamType, bias::TR_BNParamType,
running_mean::TR_BNParamType, running_var::TR_BNParamType;
momentum::Real, training::Val, epsilon::Real)
running_mean::TR_BNParamType, running_var::TR_BNParamType,
σ::F=identity; momentum::Real, training::Val, epsilon::Real) where {F}
rm, rv = LuxLib._get_batchnorm_statistics(x, running_mean, running_var, training)
# NOTE: The following returns a tracked tuple so we can't do `first` on it
x_ = LuxLib.batchnorm_cudnn(rm, rv, scale, bias, x, momentum, epsilon, training)[1]
return x_, (; running_mean=rm, running_var=rv)
return LuxLib.fast_activation!!(σ, x_), (; running_mean=rm, running_var=rv)
end

for RM in (:TrackedVector, :Nothing, :AbstractVector),
6 changes: 3 additions & 3 deletions ext/LuxLibcuDNNExt/LuxLibcuDNNExt.jl
@@ -20,11 +20,11 @@ const CUDNN_BN_ARRAY_TYPE = Union{
const BNParamType = Union{Nothing, CuVector{<:Union{Float32, Float64}}}

function LuxLib.batchnorm(x::CUDNN_BN_ARRAY_TYPE, scale::BNParamType, bias::BNParamType,
running_mean::BNParamType, running_var::BNParamType;
momentum::Real, training::Val, epsilon::Real)
running_mean::BNParamType, running_var::BNParamType, σ::F=identity;
momentum::Real, training::Val, epsilon::Real) where {F}
rm, rv = LuxLib._get_batchnorm_statistics(x, running_mean, running_var, training)
x_ = first(LuxLib.batchnorm_cudnn(rm, rv, scale, bias, x, momentum, epsilon, training))
return x_, (; running_mean=rm, running_var=rv)
return LuxLib.fast_activation!!(σ, x_), (; running_mean=rm, running_var=rv)
end

@inline function LuxLib.batchnorm_cudnn(
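
Both cuDNN batchnorm methods above now take the activation as a positional argument and apply it via fast_activation!!. A hedged usage sketch on CUDA (shapes and keyword values are illustrative only):

using CUDA, cuDNN, LuxLib, NNlib

x     = CUDA.rand(Float32, 8, 8, 4, 2)   # WHCN input
scale = CUDA.rand(Float32, 4)
bias  = CUDA.rand(Float32, 4)
rmean = CUDA.zeros(Float32, 4)
rvar  = CUDA.ones(Float32, 4)

# The activation (relu here) is fused into the normalization output.
y, stats = batchnorm(x, scale, bias, rmean, rvar, NNlib.relu;
    momentum=0.1f0, training=Val(false), epsilon=1f-5)
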
13 changes: 13 additions & 0 deletions src/LuxLib.jl
@@ -3,15 +3,20 @@ module LuxLib
using PrecompileTools: @recompile_invalidations

@recompile_invalidations begin
using ArrayInterface: ArrayInterface
using ChainRulesCore: ChainRulesCore
using FastBroadcast: @..
using FastClosures: @closure
using GPUArraysCore: GPUArraysCore
using KernelAbstractions: KernelAbstractions, @Const, @index, @kernel
using LinearAlgebra: LinearAlgebra, BLAS, mul!
using LuxCore: LuxCore
using Markdown: @doc_str
using NNlib: NNlib
using Random: Random, AbstractRNG, rand!
using Reexport: @reexport
using Statistics: Statistics, mean, std, var
using Strided: Strided, @strided
end

@reexport using NNlib
@@ -24,14 +29,22 @@ include("utils.jl")
# Low-Level Implementations
include("impl/groupnorm.jl")
include("impl/normalization.jl")
include("impl/fused_dense.jl")
include("impl/fused_conv.jl")
include("impl/fast_activation.jl")

# User Facing
include("api/batchnorm.jl")
include("api/dropout.jl")
include("api/groupnorm.jl")
include("api/instancenorm.jl")
include("api/layernorm.jl")
include("api/dense.jl")
include("api/conv.jl")
include("api/fast_activation.jl")

export batchnorm, groupnorm, instancenorm, layernorm, alpha_dropout, dropout
export fused_dense_bias_activation, fused_conv_bias_activation
export fast_activation!!

end
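
The new exports cover the fused dense and convolution paths plus the in-place activation helper. A CPU-only sketch of the dense path (shapes are illustrative):

using LuxLib, NNlib

weight = rand(Float32, 16, 8)   # out_features x in_features
x      = rand(Float32, 8, 4)    # in_features x batch
bias   = rand(Float32, 16)

y = fused_dense_bias_activation(NNlib.gelu, weight, x, bias)  # roughly gelu.(weight * x .+ bias)
z = fast_activation!!(NNlib.relu, copy(y))                    # may mutate its argument when safe
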
(Diffs for the remaining changed files are not shown.)

2 comments on commit 3d41215

@avik-pal (Member Author)


@JuliaRegistrator


Registration pull request created: JuliaRegistries/General/105553

Tip: Release Notes

Did you know you can add release notes too? Just add Markdown-formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR; if TagBot is installed, it will also be added to the
release that TagBot creates. For example:

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the pull request above is merged, it is recommended to create a tag on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or it can be done manually through the GitHub interface, or via:

git tag -a v0.3.15 -m "<description of version>" 3d41215114a5200e81ae760a4deb1a635a4fcaf1
git push origin v0.3.15
