From 5bef492dae0deb89769deb38bb1b90a2bfe18dff Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 27 Mar 2020 09:14:38 +0100 Subject: [PATCH 1/2] Adapt to restructured codegen. --- .gitlab-ci.yml | 17 ++-- Project.toml | 2 +- src/compiler/irgen.jl | 181 ++++++++++++++++++++++++++++++++++++------ test/base.jl | 2 + test/codegen.jl | 50 +----------- test/device/wmma.jl | 26 +++--- test/util.jl | 11 +++ 7 files changed, 197 insertions(+), 92 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4b277566..98a87fbf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -42,16 +42,13 @@ julia:1.4-debug: CI_CLONE_ARGS: '-b release-1.4' CI_BUILD_ARGS: 'BINARYBUILDER_LLVM_ASSERTS=1 debug' -# julia:nightly: -# extends: -# - .julia:nightly -# - .test -# tags: -# - nvidia -# - sm_75 -# variables: -# CI_THOROUGH: 'true' -# allow_failure: true +julia:nightly: + extends: + - .julia:nightly + - .test + tags: + - nvidia + allow_failure: true # CUDA versions diff --git a/Project.toml b/Project.toml index 1b509a5b..c240ef05 100644 --- a/Project.toml +++ b/Project.toml @@ -26,7 +26,7 @@ CUDAapi = "3.1, 4.0" CUDAdrv = "6.2.1" Cthulhu = "1.0" DataStructures = "0.15, 0.16, 0.17" -LLVM = "1.2" +LLVM = "1.3.4" MacroTools = "0.5" TimerOutputs = "0.5" julia = "1.3" diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl index 060d1f7b..52f46d37 100644 --- a/src/compiler/irgen.jl +++ b/src/compiler/irgen.jl @@ -47,6 +47,159 @@ Base.showerror(io::IO, err::MethodSubstitutionWarning) = print(io, "You called $(err.original), maybe you intended to call $(err.substitute) instead?") const method_substitution_whitelist = [:hypot] +if VERSION >= v"1.5.0-DEV.393" + +# JuliaLang/julia#25984 significantly restructured the compiler + +# TODO: deduplicate some code + +function compile_method_instance(job::CompilerJob, method_instance::Core.MethodInstance, world) + # set-up the compiler interface + call_stack = [method_instance] + function hook_emit_function(method_instance, code) + push!(call_stack, method_instance) + + # check for Base functions that exist in CUDAnative too + # FIXME: this might be too coarse + method = method_instance.def + if Base.moduleroot(method.module) == Base && + isdefined(CUDAnative, method_instance.def.name) && + !in(method_instance.def.name, method_substitution_whitelist) + substitute_function = getfield(CUDAnative, method.name) + tt = Tuple{method_instance.specTypes.parameters[2:end]...} + if hasmethod(substitute_function, tt) + method′ = which(substitute_function, tt) + if Base.moduleroot(method′.module) == CUDAnative + @warn "calls to Base intrinsics might be GPU incompatible" exception=(MethodSubstitutionWarning(method, method′), backtrace(job, call_stack)) + end + end + end + end + function hook_emitted_function(method, code) + @compiler_assert last(call_stack) == method job + pop!(call_stack) + end + param_kwargs = [:track_allocations => false, + :code_coverage => false, + :static_alloc => false, + :prefer_specsig => true, + :emit_function => hook_emit_function, + :emitted_function => hook_emitted_function] + if LLVM.version() >= v"8.0" && VERSION >= v"1.3.0-DEV.547" + push!(param_kwargs, :gnu_pubnames => false) + + debug_info_kind = if Base.JLOptions().debug_level == 0 + LLVM.API.LLVMDebugEmissionKindNoDebug + elseif Base.JLOptions().debug_level == 1 + LLVM.API.LLVMDebugEmissionKindLineTablesOnly + elseif Base.JLOptions().debug_level >= 2 + LLVM.API.LLVMDebugEmissionKindFullDebug + end + + #if CUDAdrv.release() < v"10.2" + # FIXME: LLVM's debug info crashes CUDA + # FIXME: this ought to be fixed on 10.2? + @debug "Incompatibility detected between CUDA and LLVM 8.0+; disabling debug info emission" maxlog=1 + debug_info_kind = LLVM.API.LLVMDebugEmissionKindNoDebug + #end + + push!(param_kwargs, :debug_info_kind => Cint(debug_info_kind)) + end + params = Base.CodegenParams(;param_kwargs...) + + # generate IR + native_code = ccall(:jl_create_native, Ptr{Cvoid}, + (Vector{Core.MethodInstance}, Base.CodegenParams), + [method_instance], params) + @assert native_code != C_NULL + llvm_mod_ref = ccall(:jl_get_llvm_module, LLVM.API.LLVMModuleRef, + (Ptr{Cvoid},), native_code) + @assert llvm_mod_ref != C_NULL + llvm_mod = LLVM.Module(llvm_mod_ref) + + # get the top-level code + code = Core.Compiler.inf_for_methodinstance(method_instance, world, world) + + # get the top-level function index + llvm_func_idx = Ref{Int32}(-1) + llvm_specfunc_idx = Ref{Int32}(-1) + ccall(:jl_breakpoint, Nothing, ()) + ccall(:jl_get_function_id, Nothing, + (Ptr{Cvoid}, Any, Ptr{Int32}, Ptr{Int32}), + native_code, code, llvm_func_idx, llvm_specfunc_idx) + @assert llvm_func_idx[] != -1 + @assert llvm_specfunc_idx[] != -1 + + # get the top-level function) + llvm_func_ref = ccall(:jl_get_llvm_function, LLVM.API.LLVMValueRef, + (Ptr{Cvoid}, UInt32), native_code, llvm_func_idx[]-1) + @assert llvm_func_ref != C_NULL + llvm_func = LLVM.Function(llvm_func_ref) + llvm_specfunc_ref = ccall(:jl_get_llvm_function, LLVM.API.LLVMValueRef, + (Ptr{Cvoid}, UInt32), native_code, llvm_specfunc_idx[]-1) + @assert llvm_specfunc_ref != C_NULL + llvm_specfunc = LLVM.Function(llvm_specfunc_ref) + + # configure the module + # NOTE: NVPTX::TargetMachine's data layout doesn't match the NVPTX user guide, + # so we specify it ourselves + if Int === Int64 + triple!(llvm_mod, "nvptx64-nvidia-cuda") + datalayout!(llvm_mod, "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64") + else + triple!(llvm_mod, "nvptx-nvidia-cuda") + datalayout!(llvm_mod, "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64") + end + + return llvm_specfunc, llvm_mod +end + +function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world) + entry, mod = @timeit_debug to "emission" compile_method_instance(job, method_instance, world) + + # clean up incompatibilities + @timeit_debug to "clean-up" for llvmf in functions(mod) + # only occurs in debug builds + delete!(function_attributes(llvmf), EnumAttribute("sspstrong", 0, JuliaContext())) + end + + # add the global exception indicator flag + emit_exception_flag!(mod) + + # rename the entry point + if job.name !== nothing + llvmfn = safe_name(string("julia_", job.name)) + else + # strip the globalUnique counter + llvmfn = LLVM.name(entry) + end + LLVM.name!(entry, llvmfn) + + # promote entry-points to kernels and mangle its name + if job.kernel + entry = promote_kernel!(job, mod, entry) + LLVM.name!(entry, mangle_call(entry, job.tt)) + end + + # minimal required optimization + @timeit_debug to "rewrite" ModulePassManager() do pm + global current_job + current_job = job + + linkage!(entry, LLVM.API.LLVMExternalLinkage) + internalize!(pm, [LLVM.name(entry)]) + + add!(pm, ModulePass("LowerThrow", lower_throw!)) + add!(pm, FunctionPass("HideUnreachable", hide_unreachable!)) + add!(pm, ModulePass("HideTrap", hide_trap!)) + run!(pm, mod) + end + + return mod, entry +end + +else + function compile_method_instance(job::CompilerJob, method_instance::Core.MethodInstance, world) function postprocess(ir) # get rid of jfptr wrappers @@ -210,33 +363,13 @@ function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world) # clean up incompatibilities @timeit_debug to "clean-up" for llvmf in functions(mod) - llvmfn = LLVM.name(llvmf) - # only occurs in debug builds delete!(function_attributes(llvmf), EnumAttribute("sspstrong", 0, JuliaContext())) - # rename functions + # make function names safe for ptxas + # (LLVM should to do this, but fails, see eg. D17738 and D19126) + llvmfn = LLVM.name(llvmf) if !isdeclaration(llvmf) - # Julia disambiguates local functions by prefixing with `#\d#`. - # since we don't use a global function namespace, get rid of those tags. - if occursin(r"^julia_#\d+#", llvmfn) - llvmfn′ = replace(llvmfn, r"#\d+#"=>"") - if !haskey(functions(mod), llvmfn′) - LLVM.name!(llvmf, llvmfn′) - llvmfn = llvmfn′ - end - end - - # anonymous functions are just named `#\d`, make that somewhat more readable - m = match(r"_#(\d+)_", llvmfn) - if m !== nothing - llvmfn′ = replace(llvmfn, m.match=>"_anonymous$(m.captures[1])_") - LLVM.name!(llvmf, llvmfn′) - llvmfn = llvmfn′ - end - - # finally, make function names safe for ptxas - # (LLVM should to do this, but fails, see eg. D17738 and D19126) llvmfn′ = safe_name(llvmfn) if llvmfn != llvmfn′ LLVM.name!(llvmf, llvmfn′) @@ -280,6 +413,8 @@ function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world) return mod, entry end +end + ## name mangling diff --git a/test/base.jl b/test/base.jl index 5d2c3bc1..23801739 100644 --- a/test/base.jl +++ b/test/base.jl @@ -2,6 +2,7 @@ ############################################################################################ +if VERSION < v"1.5.0-DEV.393" @testset "method caching" begin import InteractiveUtils: _dump_function @@ -27,6 +28,7 @@ else params) end +end end ############################################################################################ diff --git a/test/codegen.jl b/test/codegen.jl index 56620a40..8ee59ae5 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -11,7 +11,7 @@ ir = sprint(io->CUDAnative.code_llvm(io, valid_kernel, Tuple{}; optimize=false, dump_module=true)) # module should contain our function + a generic call wrapper - @test occursin(r"define void @.*julia_valid_kernel.*\(\)", ir) + @test occursin(r"define\ .* void\ @.*julia_valid_kernel.*\(\)"x, ir) @test !occursin("define %jl_value_t* @jlcall_", ir) # there should be no debug metadata @@ -130,21 +130,6 @@ end CUDAnative.code_llvm(devnull, D32593, Tuple{CuDeviceVector{D32593_struct,AS.Global}}) end -@testset "kernel names" begin - regular() = return - closure = ()->return - - function test_name(f, name; kwargs...) - code = sprint(io->CUDAnative.code_llvm(io, f, Tuple{}; kwargs...)) - @test occursin(name, code) - end - - test_name(regular, "julia_regular") - test_name(regular, "julia_regular"; kernel=true) - test_name(closure, "julia_anonymous") - test_name(closure, "julia_anonymous"; kernel=true) -end - @testset "PTX TBAA" begin load(ptr) = unsafe_load(ptr) store(ptr) = unsafe_store!(ptr, 0) @@ -256,7 +241,7 @@ end end asm = sprint(io->CUDAnative.code_ptx(io, parent, Tuple{Int64})) - @test occursin(r"call.uni\s+julia_child_"m, asm) + @test occursin(r"call.uni\s+julia_.*child_"m, asm) end @testset "kernel functions" begin @@ -314,7 +299,7 @@ end end asm = sprint(io->CUDAnative.code_ptx(io, parent1, Tuple{Int})) - @test occursin(r".func julia_child_", asm) + @test occursin(r".func julia_.*child_", asm) function parent2(i) child(i+1) @@ -322,7 +307,7 @@ end end asm = sprint(io->CUDAnative.code_ptx(io, parent2, Tuple{Int})) - @test occursin(r".func julia_child_", asm) + @test occursin(r".func julia_.*child_", asm) end @testset "child function reuse bis" begin @@ -386,21 +371,6 @@ end CUDAnative.code_ptx(devnull, kernel, Tuple{Float64}) end -@testset "kernel names" begin - regular() = nothing - closure = ()->nothing - - function test_name(f, name; kwargs...) - code = sprint(io->CUDAnative.code_ptx(io, f, Tuple{}; kwargs...)) - @test occursin(name, code) - end - - test_name(regular, "julia_regular") - test_name(regular, "julia_regular"; kernel=true) - test_name(closure, "julia_anonymous") - test_name(closure, "julia_anonymous"; kernel=true) -end - @testset "exception arguments" begin function kernel(a) unsafe_store!(a, trunc(Int, unsafe_load(a))) @@ -478,18 +448,6 @@ end # some validation happens in the emit_function hook, which is called by code_llvm -@testset "recursion" begin - @eval recurse_outer(i) = i > 0 ? i : recurse_inner(i) - @eval @noinline recurse_inner(i) = i < 0 ? i : recurse_outer(i) - - @test_throws_message(CUDAnative.KernelError, CUDAnative.code_llvm(devnull, recurse_outer, Tuple{Int})) do msg - occursin("recursion is currently not supported", msg) && - occursin("[1] recurse_outer", msg) && - occursin("[2] recurse_inner", msg) && - occursin("[3] recurse_outer", msg) - end -end - @testset "base intrinsics" begin foobar(i) = sin(i) diff --git a/test/device/wmma.jl b/test/device/wmma.jl index 6c1382c6..75aaac7e 100644 --- a/test/device/wmma.jl +++ b/test/device/wmma.jl @@ -231,18 +231,20 @@ is_debug ? @warn("Skipping WMMA tests due to incompatible Julia") : @testset "WM return end - @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta) - d = Array(d_dev) - - new_a = (a_layout == ColMajor) ? a : transpose(a) - new_b = (b_layout == ColMajor) ? b : transpose(b) - new_c = (c_layout == ColMajor) ? c : transpose(c) - new_d = (d_layout == ColMajor) ? d : transpose(d) - - if do_mac - @test all(isapprox.(alpha * new_a * new_b + beta * new_c, new_d; rtol=sqrt(eps(Float16)))) - else - @test all(isapprox.(alpha * new_a * new_b, new_d; rtol=sqrt(eps(Float16)))) + @test_broken_if VERSION >= v"1.5.0-DEV.393" begin + @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta) + d = Array(d_dev) + + new_a = (a_layout == ColMajor) ? a : transpose(a) + new_b = (b_layout == ColMajor) ? b : transpose(b) + new_c = (c_layout == ColMajor) ? c : transpose(c) + new_d = (d_layout == ColMajor) ? d : transpose(d) + + if do_mac + all(isapprox.(alpha * new_a * new_b + beta * new_c, new_d; rtol=sqrt(eps(Float16)))) + else + all(isapprox.(alpha * new_a * new_b, new_d; rtol=sqrt(eps(Float16)))) + end end end diff --git a/test/util.jl b/test/util.jl index 34986520..05ed6d41 100644 --- a/test/util.jl +++ b/test/util.jl @@ -101,3 +101,14 @@ function julia_script(code, args=``) wait(proc) proc.exitcode, read(out, String), read(err, String) end + +# tests that are conditionall broken +macro test_broken_if(cond, ex...) + quote + if $(esc(cond)) + @test_broken $(map(esc, ex)...) + else + @test $(map(esc, ex)...) + end + end +end From 50b7f30a792d2547bfe87a1c4d3f733f0fbe8761 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 27 Mar 2020 09:40:09 +0100 Subject: [PATCH 2/2] Fix WMMA codegen tests for below PTX 6.3. --- test/device/wmma.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/device/wmma.jl b/test/device/wmma.jl index 75aaac7e..343fc47c 100644 --- a/test/device/wmma.jl +++ b/test/device/wmma.jl @@ -268,8 +268,8 @@ if VERSION >= v"1.5.0-DEV.324" ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDAnative.AS.Global},))) - @test !occursin("wmma.store.d.sync.aligned.col.m16n16k16.f32", ptx) - @test occursin("wmma.store.d.sync.aligned.col.m16n16k16.global.f32", ptx) + @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx) + @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx) end @testset "Shared" begin @@ -285,8 +285,8 @@ if VERSION >= v"1.5.0-DEV.324" ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, ())) - @test !occursin("wmma.store.d.sync.aligned.col.m16n16k16.f32", ptx) - @test occursin("wmma.store.d.sync.aligned.col.m16n16k16.shared.f32", ptx) + @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx) + @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx) end end end