This repository has been archived by the owner on May 27, 2021. It is now read-only.

Use new non-recursive codegen #162

Merged 2 commits on Mar 27, 2020
17 changes: 7 additions & 10 deletions .gitlab-ci.yml
@@ -42,16 +42,13 @@ julia:1.4-debug:
CI_CLONE_ARGS: '-b release-1.4'
CI_BUILD_ARGS: 'BINARYBUILDER_LLVM_ASSERTS=1 debug'

# julia:nightly:
# extends:
# - .julia:nightly
# - .test
# tags:
# - nvidia
# - sm_75
# variables:
# CI_THOROUGH: 'true'
# allow_failure: true
julia:nightly:
extends:
- .julia:nightly
- .test
tags:
- nvidia
allow_failure: true


# CUDA versions
2 changes: 1 addition & 1 deletion Project.toml
@@ -26,7 +26,7 @@ CUDAapi = "3.1, 4.0"
CUDAdrv = "6.2.1"
Cthulhu = "1.0"
DataStructures = "0.15, 0.16, 0.17"
LLVM = "1.2"
LLVM = "1.3.4"
MacroTools = "0.5"
TimerOutputs = "0.5"
julia = "1.3"
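The compat bump to LLVM.jl 1.3.4 presumably accompanies the new codegen path in irgen.jl below. As a purely illustrative sanity check of the environment (not part of this PR), one could verify both the package version and the underlying library version that the v"8.0" gate in irgen.jl compares against:

using Pkg, LLVM
Pkg.status("LLVM")     # should report LLVM.jl ≥ 1.3.4 after this change
LLVM.version()         # the LLVM library version compared against v"8.0" in irgen.jl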
181 changes: 158 additions & 23 deletions src/compiler/irgen.jl
@@ -47,6 +47,159 @@ Base.showerror(io::IO, err::MethodSubstitutionWarning) =
print(io, "You called $(err.original), maybe you intended to call $(err.substitute) instead?")
const method_substitution_whitelist = [:hypot]

if VERSION >= v"1.5.0-DEV.393"

# JuliaLang/julia#25984 significantly restructured the compiler

# TODO: deduplicate some code

function compile_method_instance(job::CompilerJob, method_instance::Core.MethodInstance, world)
# set-up the compiler interface
call_stack = [method_instance]
function hook_emit_function(method_instance, code)
push!(call_stack, method_instance)

# check for Base functions that exist in CUDAnative too
# FIXME: this might be too coarse
method = method_instance.def
if Base.moduleroot(method.module) == Base &&
isdefined(CUDAnative, method_instance.def.name) &&
!in(method_instance.def.name, method_substitution_whitelist)
substitute_function = getfield(CUDAnative, method.name)
tt = Tuple{method_instance.specTypes.parameters[2:end]...}
if hasmethod(substitute_function, tt)
method′ = which(substitute_function, tt)
if Base.moduleroot(method′.module) == CUDAnative
@warn "calls to Base intrinsics might be GPU incompatible" exception=(MethodSubstitutionWarning(method, method′), backtrace(job, call_stack))
end
end
end
end
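# NOTE: as a hypothetical example, emitting device code that calls Base.sin (for which
#       CUDAnative also provides a definition) would trigger the warning above, while
#       names in method_substitution_whitelist such as hypot are exempt.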
function hook_emitted_function(method, code)
@compiler_assert last(call_stack) == method job
pop!(call_stack)
end
param_kwargs = [:track_allocations => false,
:code_coverage => false,
:static_alloc => false,
:prefer_specsig => true,
:emit_function => hook_emit_function,
:emitted_function => hook_emitted_function]
if LLVM.version() >= v"8.0" && VERSION >= v"1.3.0-DEV.547"
push!(param_kwargs, :gnu_pubnames => false)

debug_info_kind = if Base.JLOptions().debug_level == 0
LLVM.API.LLVMDebugEmissionKindNoDebug
elseif Base.JLOptions().debug_level == 1
LLVM.API.LLVMDebugEmissionKindLineTablesOnly
elseif Base.JLOptions().debug_level >= 2
LLVM.API.LLVMDebugEmissionKindFullDebug
end
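# (for instance, starting Julia with `-g2` sets debug_level to 2 and would request
# full debug info here, were it not overridden below due to the CUDA incompatibility)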

#if CUDAdrv.release() < v"10.2"
# FIXME: LLVM's debug info crashes CUDA
# FIXME: this ought to be fixed on 10.2?
@debug "Incompatibility detected between CUDA and LLVM 8.0+; disabling debug info emission" maxlog=1
debug_info_kind = LLVM.API.LLVMDebugEmissionKindNoDebug
#end

push!(param_kwargs, :debug_info_kind => Cint(debug_info_kind))
end
params = Base.CodegenParams(;param_kwargs...)

# generate IR
native_code = ccall(:jl_create_native, Ptr{Cvoid},
(Vector{Core.MethodInstance}, Base.CodegenParams),
[method_instance], params)
@assert native_code != C_NULL
llvm_mod_ref = ccall(:jl_get_llvm_module, LLVM.API.LLVMModuleRef,
(Ptr{Cvoid},), native_code)
@assert llvm_mod_ref != C_NULL
llvm_mod = LLVM.Module(llvm_mod_ref)

# get the top-level code
code = Core.Compiler.inf_for_methodinstance(method_instance, world, world)

# get the top-level function index
llvm_func_idx = Ref{Int32}(-1)
llvm_specfunc_idx = Ref{Int32}(-1)
ccall(:jl_breakpoint, Nothing, ())
ccall(:jl_get_function_id, Nothing,
(Ptr{Cvoid}, Any, Ptr{Int32}, Ptr{Int32}),
native_code, code, llvm_func_idx, llvm_specfunc_idx)
@assert llvm_func_idx[] != -1
@assert llvm_specfunc_idx[] != -1

# get the top-level function
llvm_func_ref = ccall(:jl_get_llvm_function, LLVM.API.LLVMValueRef,
(Ptr{Cvoid}, UInt32), native_code, llvm_func_idx[]-1)
@assert llvm_func_ref != C_NULL
llvm_func = LLVM.Function(llvm_func_ref)
llvm_specfunc_ref = ccall(:jl_get_llvm_function, LLVM.API.LLVMValueRef,
(Ptr{Cvoid}, UInt32), native_code, llvm_specfunc_idx[]-1)
@assert llvm_specfunc_ref != C_NULL
llvm_specfunc = LLVM.Function(llvm_specfunc_ref)

# configure the module
# NOTE: NVPTX::TargetMachine's data layout doesn't match the NVPTX user guide,
# so we specify it ourselves
if Int === Int64
triple!(llvm_mod, "nvptx64-nvidia-cuda")
datalayout!(llvm_mod, "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64")
else
triple!(llvm_mod, "nvptx-nvidia-cuda")
datalayout!(llvm_mod, "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64")
end

return llvm_specfunc, llvm_mod
end
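# In rough terms, the 1.5+ flow above amounts to (simplified sketch, not literal code):
#
#   native_code    = jl_create_native([method_instance], params)  # emit the entry and all callees at once
#   llvm_mod       = jl_get_llvm_module(native_code)              # a single module containing everything
#   func, specfunc = jl_get_function_id(native_code, code)        # locate the entry points in that module
#
# whereas the pre-1.5 path below relies on Julia's older recursive emission, roughly
# collecting and linking a separate module per method instance as the hooks fire.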

function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world)
entry, mod = @timeit_debug to "emission" compile_method_instance(job, method_instance, world)

# clean up incompatibilities
@timeit_debug to "clean-up" for llvmf in functions(mod)
# only occurs in debug builds
delete!(function_attributes(llvmf), EnumAttribute("sspstrong", 0, JuliaContext()))
end

# add the global exception indicator flag
emit_exception_flag!(mod)

# rename the entry point
if job.name !== nothing
llvmfn = safe_name(string("julia_", job.name))
else
# strip the globalUnique counter
llvmfn = LLVM.name(entry)
end
LLVM.name!(entry, llvmfn)

# promote entry-points to kernels and mangle their names
if job.kernel
entry = promote_kernel!(job, mod, entry)
LLVM.name!(entry, mangle_call(entry, job.tt))
end

# minimal required optimization
@timeit_debug to "rewrite" ModulePassManager() do pm
global current_job
current_job = job

linkage!(entry, LLVM.API.LLVMExternalLinkage)
internalize!(pm, [LLVM.name(entry)])

add!(pm, ModulePass("LowerThrow", lower_throw!))
add!(pm, FunctionPass("HideUnreachable", hide_unreachable!))
add!(pm, ModulePass("HideTrap", hide_trap!))
run!(pm, mod)
end

return mod, entry
end

else

function compile_method_instance(job::CompilerJob, method_instance::Core.MethodInstance, world)
function postprocess(ir)
# get rid of jfptr wrappers
@@ -210,33 +363,13 @@ function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world)

# clean up incompatibilities
@timeit_debug to "clean-up" for llvmf in functions(mod)
llvmfn = LLVM.name(llvmf)

# only occurs in debug builds
delete!(function_attributes(llvmf), EnumAttribute("sspstrong", 0, JuliaContext()))

# rename functions
# make function names safe for ptxas
# (LLVM should do this, but fails; see e.g. D17738 and D19126)
llvmfn = LLVM.name(llvmf)
if !isdeclaration(llvmf)
# Julia disambiguates local functions by prefixing with `#\d#`.
# since we don't use a global function namespace, get rid of those tags.
if occursin(r"^julia_#\d+#", llvmfn)
llvmfn′ = replace(llvmfn, r"#\d+#"=>"")
if !haskey(functions(mod), llvmfn′)
LLVM.name!(llvmf, llvmfn′)
llvmfn = llvmfn′
end
end

# anonymous functions are just named `#\d`, make that somewhat more readable
m = match(r"_#(\d+)_", llvmfn)
if m !== nothing
llvmfn′ = replace(llvmfn, m.match=>"_anonymous$(m.captures[1])_")
LLVM.name!(llvmf, llvmfn′)
llvmfn = llvmfn′
end

# finally, make function names safe for ptxas
# (LLVM should do this, but fails; see e.g. D17738 and D19126)
llvmfn′ = safe_name(llvmfn)
if llvmfn != llvmfn′
LLVM.name!(llvmf, llvmfn′)
@@ -280,6 +413,8 @@ function irgen(job::CompilerJob, method_instance::Core.MethodInstance, world)
return mod, entry
end

end


## name mangling

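For reference, the reworked irgen path is still driven through the unchanged reflection entry points; a minimal smoke test on Julia ≥ 1.5.0-DEV.393 might look like the sketch below (the empty dummy_kernel is an assumption for illustration and does not appear in the diff):

using CUDAnative

dummy_kernel() = return    # hypothetical empty kernel

# unoptimized module IR, as produced by compile_method_instance/irgen above
CUDAnative.code_llvm(stdout, dummy_kernel, Tuple{}; optimize=false, dump_module=true)

# PTX for the promoted kernel entry point
CUDAnative.code_ptx(stdout, dummy_kernel, Tuple{}; kernel=true)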
2 changes: 2 additions & 0 deletions test/base.jl
@@ -2,6 +2,7 @@

############################################################################################

if VERSION < v"1.5.0-DEV.393"
@testset "method caching" begin

import InteractiveUtils: _dump_function
@@ -27,6 +28,7 @@ else
params)
end

end
end

############################################################################################
50 changes: 4 additions & 46 deletions test/codegen.jl
@@ -11,7 +11,7 @@
ir = sprint(io->CUDAnative.code_llvm(io, valid_kernel, Tuple{}; optimize=false, dump_module=true))

# module should contain our function + a generic call wrapper
@test occursin(r"define void @.*julia_valid_kernel.*\(\)", ir)
@test occursin(r"define\ .* void\ @.*julia_valid_kernel.*\(\)"x, ir)
@test !occursin("define %jl_value_t* @jlcall_", ir)

# there should be no debug metadata
@@ -130,21 +130,6 @@ end
CUDAnative.code_llvm(devnull, D32593, Tuple{CuDeviceVector{D32593_struct,AS.Global}})
end

@testset "kernel names" begin
regular() = return
closure = ()->return

function test_name(f, name; kwargs...)
code = sprint(io->CUDAnative.code_llvm(io, f, Tuple{}; kwargs...))
@test occursin(name, code)
end

test_name(regular, "julia_regular")
test_name(regular, "julia_regular"; kernel=true)
test_name(closure, "julia_anonymous")
test_name(closure, "julia_anonymous"; kernel=true)
end

@testset "PTX TBAA" begin
load(ptr) = unsafe_load(ptr)
store(ptr) = unsafe_store!(ptr, 0)
@@ -256,7 +241,7 @@ end
end

asm = sprint(io->CUDAnative.code_ptx(io, parent, Tuple{Int64}))
@test occursin(r"call.uni\s+julia_child_"m, asm)
@test occursin(r"call.uni\s+julia_.*child_"m, asm)
end

@testset "kernel functions" begin
@@ -314,15 +299,15 @@ end
end

asm = sprint(io->CUDAnative.code_ptx(io, parent1, Tuple{Int}))
@test occursin(r".func julia_child_", asm)
@test occursin(r".func julia_.*child_", asm)

function parent2(i)
child(i+1)
return
end

asm = sprint(io->CUDAnative.code_ptx(io, parent2, Tuple{Int}))
@test occursin(r".func julia_child_", asm)
@test occursin(r".func julia_.*child_", asm)
end

@testset "child function reuse bis" begin
@@ -386,21 +371,6 @@ end
CUDAnative.code_ptx(devnull, kernel, Tuple{Float64})
end

@testset "kernel names" begin
regular() = nothing
closure = ()->nothing

function test_name(f, name; kwargs...)
code = sprint(io->CUDAnative.code_ptx(io, f, Tuple{}; kwargs...))
@test occursin(name, code)
end

test_name(regular, "julia_regular")
test_name(regular, "julia_regular"; kernel=true)
test_name(closure, "julia_anonymous")
test_name(closure, "julia_anonymous"; kernel=true)
end

@testset "exception arguments" begin
function kernel(a)
unsafe_store!(a, trunc(Int, unsafe_load(a)))
@@ -478,18 +448,6 @@ end

# some validation happens in the emit_function hook, which is called by code_llvm

@testset "recursion" begin
@eval recurse_outer(i) = i > 0 ? i : recurse_inner(i)
@eval @noinline recurse_inner(i) = i < 0 ? i : recurse_outer(i)

@test_throws_message(CUDAnative.KernelError, CUDAnative.code_llvm(devnull, recurse_outer, Tuple{Int})) do msg
occursin("recursion is currently not supported", msg) &&
occursin("[1] recurse_outer", msg) &&
occursin("[2] recurse_inner", msg) &&
occursin("[3] recurse_outer", msg)
end
end

@testset "base intrinsics" begin
foobar(i) = sin(i)

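The regex changes above reflect that the new codegen can, presumably, insert extra components between the julia_ prefix and the original function name, so the tests now only require the name to appear somewhere after the prefix. An illustrative check against an assumed (made-up) PTX fragment:

using Test

asm = ".func julia_YY.child_12345"           # hypothetical mangled name, not taken from real output
@test occursin(r".func julia_.*child_", asm)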