From ab54644b54e9e7ebeae8704c7fc51a3899380049 Mon Sep 17 00:00:00 2001 From: Avik Pal Date: Mon, 27 Mar 2023 22:05:22 -0400 Subject: [PATCH] Integrate LuxTestUtils --- test/LocalPreferences.toml | 2 + test/Project.toml | 4 +- test/api/batchnorm.jl | 12 ++-- test/api/dropout.jl | 35 ++++++----- test/api/groupnorm.jl | 25 ++++---- test/api/instancenorm.jl | 18 ++---- test/api/layernorm.jl | 17 +++-- test/ext/LuxLibForwardDiffExt.jl | 2 +- test/test_utils.jl | 105 +------------------------------ 9 files changed, 58 insertions(+), 162 deletions(-) create mode 100644 test/LocalPreferences.toml diff --git a/test/LocalPreferences.toml b/test/LocalPreferences.toml new file mode 100644 index 00000000..1e3d8dda --- /dev/null +++ b/test/LocalPreferences.toml @@ -0,0 +1,2 @@ +[LuxTestUtils] +target_modules = ["LuxLib"] diff --git a/test/Project.toml b/test/Project.toml index 703b30c7..9341e347 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,14 +1,12 @@ [deps] -FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda" +LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267" SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] diff --git a/test/api/batchnorm.jl b/test/api/batchnorm.jl index b9302506..a3211f98 100644 --- a/test/api/batchnorm.jl +++ b/test/api/batchnorm.jl @@ -34,7 +34,8 @@ end y, nt = batchnorm(x, scale, bias, rm, rv; epsilon, training, momentum=T(0.9)) @inferred batchnorm(x, scale, bias, rm, rv; epsilon, training, momentum=T(0.9)) - run_JET_tests(_f, x, scale, bias, rm, rv) + + @jet _f(x, scale, bias, rm, rv) @test y isa aType{T, length(sz)} @test size(y) == sz @@ -45,17 +46,16 @@ end end if __istraining(training) + fp16 = T == Float16 if affine __f = (args...) -> sum(first(batchnorm(args..., rm, rv; epsilon, training, momentum=T(0.9)))) - test_gradient_correctness(__f, x, scale, bias; gpu_testing=on_gpu, - skip_fdm=T == Float16, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) + @eval @test_gradients $__f $x $scale $bias gpu_testing=$on_gpu soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 else __f = (args...) -> sum(first(batchnorm(args..., scale, bias, rm, rv; epsilon, training, momentum=T(0.9)))) - test_gradient_correctness(__f, x; gpu_testing=on_gpu, skip_fdm=T == Float16, - atol=1.0f-2, rtol=1.0f-2, soft_fail=T == Float16) + + @eval @test_gradients $__f $x gpu_testing=$on_gpu soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 end end end diff --git a/test/api/dropout.jl b/test/api/dropout.jl index 5b473cf9..659c71ca 100644 --- a/test/api/dropout.jl +++ b/test/api/dropout.jl @@ -22,9 +22,10 @@ rng = MersenneTwister(0) @test rng != rng_ __f = x -> sum(first(dropout(rng, x, T(0.5), Val(true); dims=Colon()))) - test_gradient_correctness(__f, x; gpu_testing=on_gpu, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) - run_JET_tests(__f, x) + + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet __f(x) @inferred dropout(rng, x, T(0.5), Val(true); dims=Colon()) @@ -58,9 +59,10 @@ end end __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(true); dims=Colon()))) - test_gradient_correctness(__f, x; gpu_testing=on_gpu, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) - run_JET_tests(__f, x) + + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet __f(x) # Try using mask if possible (possible!!) @inferred dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()) @@ -76,9 +78,10 @@ end end __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()))) - test_gradient_correctness(__f, x; gpu_testing=on_gpu, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) - run_JET_tests(__f, x) + + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet __f(x) mask = rand(T, (x_shape[1:(end - 1)]..., 13)) |> aType @@ -96,9 +99,10 @@ end end __f = x -> sum(first(dropout(rng, x, mask, T(0.5), Val(true), Val(false); dims=Colon()))) - test_gradient_correctness(__f, x; gpu_testing=on_gpu, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) - run_JET_tests(__f, x) + + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet __f(x) # Testing Mode @inferred dropout(rng, x, mask, T(0.5), Val(false), Val(false); dims=Colon()) @@ -129,9 +133,10 @@ end end @test_broken isapprox(std(y), std(x); atol=1.0f-2, rtol=1.0f-2) __f = x -> sum(first(alpha_dropout(rng, x, T(0.5), Val(true)))) - test_gradient_correctness(__f, x; gpu_testing=on_gpu, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) - run_JET_tests(__f, x) + + fp16 = T == Float16 + @eval @test_gradients $__f $x atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 gpu_testing=$on_gpu + @jet __f(x) @inferred alpha_dropout(rng, x, T(0.5), Val(false)) diff --git a/test/api/groupnorm.jl b/test/api/groupnorm.jl index 35a8cd3f..1c27ddca 100644 --- a/test/api/groupnorm.jl +++ b/test/api/groupnorm.jl @@ -45,7 +45,7 @@ end bias) @inferred groupnorm(x, scale, bias; groups, epsilon) - run_JET_tests(_f, x, scale, bias; opt_broken=true) + @jet _f(x, scale, bias) opt_broken=true @test y isa aType{T, 4} @test size(y) == sz @@ -60,14 +60,14 @@ end # The KA implementation reorders operations manually for maximal # performance. Hence equality cannot be guaranteed. - @test isapprox(y, y_; atol=1.0f-3, rtol=1.0f-3) - @test isapprox(gs_x, gs_x_; atol=1.0f-3, rtol=1.0f-3) - @test isapprox(gs_scale, gs_scale_; atol=1.0f-3, rtol=1.0f-3) - @test isapprox(gs_bias, gs_bias_; atol=1.0f-3, rtol=1.0f-3) - - test_gradient_correctness((args...) -> sum(_f(args...)), x, scale, bias; - gpu_testing=on_gpu, atol=1.0f-3, rtol=1.0f-3, - soft_fail=T == Float16) + @test check_approx(y, y_; atol=1.0f-3, rtol=1.0f-3) + @test check_approx(gs_x, gs_x_; atol=1.0f-3, rtol=1.0f-3) + @test check_approx(gs_scale, gs_scale_; atol=1.0f-3, rtol=1.0f-3) + @test check_approx(gs_bias, gs_bias_; atol=1.0f-3, rtol=1.0f-3) + + fp16 = T == Float16 + __f = sum ∘ _f + @eval @test_gradients $__f $x $scale $bias gpu_testing=$on_gpu atol=1.0f-3 rtol=1.0f-3 soft_fail=$fp16 end end end @@ -85,17 +85,16 @@ end end @inferred groupnorm(x, scale, bias, rm, rv; groups, epsilon, training, momentum=T(0.9)) - run_JET_tests(_f, x, scale, bias, rm, rv; opt_broken=true) + @jet _f(x, scale, bias, rm, rv) opt_broken=true @test y isa aType{T, 4} @test size(y) == sz @test size(nt.running_mean) == (groups,) @test size(nt.running_var) == (groups,) + fp16 = T == Float16 __f = (args...) -> sum(first(groupnorm(args..., rm, rv; groups, epsilon, training, momentum=T(0.9)))) - test_gradient_correctness(__f, x, scale, bias; gpu_testing=on_gpu, - skip_fdm=T == Float16, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) + @eval @test_gradients $__f $x $scale $bias gpu_testing=$on_gpu atol=1.0f-2 rtol=1.0f-2 soft_fail=$fp16 end end end diff --git a/test/api/instancenorm.jl b/test/api/instancenorm.jl index 5c543f7e..5d067645 100644 --- a/test/api/instancenorm.jl +++ b/test/api/instancenorm.jl @@ -26,30 +26,24 @@ end y, nt = instancenorm(x, scale, bias; epsilon, training) @inferred instancenorm(x, scale, bias; epsilon, training) - run_JET_tests(_f, x, scale, bias) + @jet _f(x, scale, bias) @test y isa aType{T, length(sz)} @test size(y) == sz _target_std = ones(ntuple(_ -> 1, length(sz) - 2)..., size(x)[(end - 1):end]...) - if length(sz) != 3 - @test isapprox(std(Array(y); dims=1:(length(sz) - 2)), _target_std; atol=0.2) - else - @test_broken isapprox(std(Array(y); dims=1:(length(sz) - 2)), _target_std; - atol=0.2) - end + @eval @test check_approx(std(Array($y); dims=1:($(length(sz) - 2))), $_target_std; + atol=0.2, rtol=0.2) @test std(y; dims=1:(length(sz) - 2)) != std(x; dims=1:(length(sz) - 2)) if __istraining(training) + fp16 = T == Float16 if affine __f = (args...) -> sum(first(instancenorm(args...; epsilon, training))) - test_gradient_correctness(__f, x, scale, bias; gpu_testing=on_gpu, - skip_fdm=T == Float16, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) + @eval @test_gradients $__f $x $scale $bias soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu else __f = (args...) -> sum(first(instancenorm(args..., scale, bias; epsilon, training))) - test_gradient_correctness(__f, x; gpu_testing=on_gpu, skip_fdm=T == Float16, - atol=1.0f-2, rtol=1.0f-2, soft_fail=T == Float16) + @eval @test_gradients $__f $x soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu end end end diff --git a/test/api/layernorm.jl b/test/api/layernorm.jl index 9fdf3f9a..a91681db 100644 --- a/test/api/layernorm.jl +++ b/test/api/layernorm.jl @@ -26,7 +26,7 @@ end x, scale, bias = _setup_layernorm(aType, T, x_shape, affine_shape) @inferred _f(x, scale, bias) - run_JET_tests(_f, x, scale, bias) + @jet _f(x, scale, bias) y = _f(x, scale, bias) @@ -34,18 +34,17 @@ end @test size(y) == x_shape if affine_shape === nothing - @test isapprox(mean(y; dims), 0; atol=1e-3, rtol=1e-3) - @test isapprox(std(y; dims), 1; atol=1e-1, rtol=1e-1) + @test check_approx(mean(y; dims), 0; atol=1e-3, rtol=1e-3) + @test check_approx(std(y; dims), 1; atol=1e-1, rtol=1e-1) end + fp16 = T == Float16 if affine_shape === nothing - test_gradient_correctness(x -> sum(_f(x, nothing, nothing)), x; - skip_fdm=T == Float16, gpu_testing=on_gpu, - atol=1.0f-2, rtol=1.0f-2, soft_fail=T == Float16) + __f = x -> sum(_f(x, nothing, nothing)) + @eval @test_gradients $__f $x soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu else - test_gradient_correctness(sum ∘ _f, x, scale, bias; skip_fdm=T == Float16, - gpu_testing=on_gpu, atol=1.0f-2, rtol=1.0f-2, - soft_fail=T == Float16) + __f = sum ∘ _f + @eval @test_gradients $__f $x $scale $bias soft_fail=$fp16 atol=1.0f-2 rtol=1.0f-2 gpu_testing=$on_gpu end end end end diff --git a/test/ext/LuxLibForwardDiffExt.jl b/test/ext/LuxLibForwardDiffExt.jl index 458df160..a72d7c14 100644 --- a/test/ext/LuxLibForwardDiffExt.jl +++ b/test/ext/LuxLibForwardDiffExt.jl @@ -13,5 +13,5 @@ rng = MersenneTwister(0) x_dropout = dropout(rng, x, 0.5f0, Val(true); dims=:)[1] x_dual_dropout = ForwardDiff.value.(dropout(rng, x_dual, 0.5f0, Val(true); dims=:)[1]) - @test isapprox(x_dropout, x_dual_dropout) + @test check_approx(x_dropout, x_dual_dropout) end end diff --git a/test/test_utils.jl b/test/test_utils.jl index dceac9a5..2ff879e5 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -1,6 +1,6 @@ -using FiniteDifferences, LuxLib, Test +using LuxLib, LuxTestUtils, Test, Zygote using LuxCUDA # CUDA Support -using ReverseDiff, Tracker, Zygote # AD Packages +using LuxTestUtils: @jet, @test_gradients, check_approx const GROUP = get(ENV, "GROUP", "All") @@ -23,105 +23,4 @@ const MODES = begin end end -try - using JET -catch - @warn "JET not not precompiling. All JET tests will be skipped." maxlog=1 - global test_call(args...; kwargs...) = nothing - global test_opt(args...; kwargs...) = nothing -end - -function Base.isapprox(x, y; kwargs...) - @warn "`isapprox` is not defined for ($(typeof(x)), $(typeof(y))). Using `==` instead." - return x == y -end - -function Base.isapprox(x::Tuple, y::Tuple; kwargs...) - return all(isapprox.(x, y; kwargs...)) -end - -function Base.isapprox(nt1::NamedTuple{fields}, nt2::NamedTuple{fields}; - kwargs...) where {fields} - checkapprox(xy) = isapprox(xy[1], xy[2]; kwargs...) - checkapprox(t::Tuple{Nothing, Nothing}) = true - return all(checkapprox, zip(values(nt1), values(nt2))) -end - -function Base.isapprox(t1::NTuple{N, T}, t2::NTuple{N, T}; kwargs...) where {N, T} - checkapprox(xy) = isapprox(xy[1], xy[2]; kwargs...) - checkapprox(t::Tuple{Nothing, Nothing}) = true - return all(checkapprox, zip(t1, t2)) -end - -Base.isapprox(::Nothing, v::AbstractArray; kwargs...) = length(v) == 0 -Base.isapprox(v::AbstractArray, ::Nothing; kwargs...) = length(v) == 0 -Base.isapprox(v::NamedTuple, ::Nothing; kwargs...) = length(v) == 0 -Base.isapprox(::Nothing, v::NamedTuple; kwargs...) = length(v) == 0 -Base.isapprox(v::Tuple, ::Nothing; kwargs...) = length(v) == 0 -Base.isapprox(::Nothing, v::Tuple; kwargs...) = length(v) == 0 -Base.isapprox(x::AbstractArray, y::NamedTuple; kwargs...) = length(x) == 0 && length(y) == 0 -Base.isapprox(x::NamedTuple, y::AbstractArray; kwargs...) = length(x) == 0 && length(y) == 0 -Base.isapprox(x::AbstractArray, y::Tuple; kwargs...) = length(x) == 0 && length(y) == 0 -Base.isapprox(x::Tuple, y::AbstractArray; kwargs...) = length(x) == 0 && length(y) == 0 - -# JET Tests -function run_JET_tests(f, args...; call_broken=false, opt_broken=false, kwargs...) - @static if VERSION >= v"1.7" - test_call(f, typeof.(args); broken=call_broken, target_modules=(LuxLib,)) - test_opt(f, typeof.(args); broken=opt_broken, target_modules=(LuxLib,)) - end -end - __istraining(::Val{training}) where {training} = training - -# Test the gradients across AD Frameworks and FiniteDifferences -# TODO: Implement it as a macro so that we get correct line numbers for `@test` failures. -function test_gradient_correctness(f::Function, args...; gpu_testing::Bool=false, - skip_fdm::Bool=false, skip_fdm_override::Bool=false, - soft_fail::Bool=false, kwargs...) - gs_ad_zygote = Zygote.gradient(f, args...) - gs_ad_tracker = Tracker.gradient(f, args...) - gs_ad_reversediff = gpu_testing ? nothing : ReverseDiff.gradient(f, args) - - if !skip_fdm_override - arr_len = length.(args) - if any(x -> x >= 25, arr_len) || sum(arr_len) >= 100 - @warn "Skipping FiniteDifferences test for large arrays: $(arr_len)." - skip_fdm = true - end - end - - gs_fdm = gpu_testing || skip_fdm ? nothing : - FiniteDifferences.grad(FiniteDifferences.central_fdm(8, 1), f, args...) - for idx in 1:length(gs_ad_zygote) - _c1 = isapprox(Tracker.data(gs_ad_tracker[idx]), gs_ad_zygote[idx]; kwargs...) - if soft_fail && !_c1 - @test_broken isapprox(Tracker.data(gs_ad_tracker[idx]), gs_ad_zygote[idx]; - kwargs...) - else - @test isapprox(Tracker.data(gs_ad_tracker[idx]), gs_ad_zygote[idx]; kwargs...) - end - - if !gpu_testing - if !skip_fdm - _c2 = isapprox(gs_ad_zygote[idx], gs_fdm[idx]; kwargs...) - if soft_fail && !_c2 - @test_broken isapprox(gs_ad_zygote[idx], gs_fdm[idx]; kwargs...) - else - @test isapprox(gs_ad_zygote[idx], gs_fdm[idx]; kwargs...) - end - end - - _c3 = isapprox(ReverseDiff.value(gs_ad_reversediff[idx]), gs_ad_zygote[idx]; - kwargs...) - if soft_fail && !_c3 - @test_broken isapprox(ReverseDiff.value(gs_ad_reversediff[idx]), - gs_ad_zygote[idx]; kwargs...) - else - @test isapprox(ReverseDiff.value(gs_ad_reversediff[idx]), gs_ad_zygote[idx]; - kwargs...) - end - end - end - return -end