Skip to content

Commit

Permalink
AMDGPU JIT compiler (#453)
Browse files Browse the repository at this point in the history
* feat(AMD GPU): initial commit to support AMD GPU (working for ASM but not object code)

* feat(AMD GPU): 🔥 🔥 end to end JIT compilation to AMD GPU is working
  • Loading branch information
mratsim authored Aug 5, 2024
1 parent cfe077b commit 1e34ec2
Show file tree
Hide file tree
Showing 13 changed files with 2,093 additions and 36 deletions.
188 changes: 188 additions & 0 deletions constantine/math_compiler/codegen_amdgpu.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
constantine/platforms/abis/amdgpu_abi {.all.},
constantine/platforms/abis/amdcomgr_abi,
constantine/platforms/abis/c_abi,
constantine/platforms/llvm/llvm,
constantine/platforms/primitives,
./ir

export
amdgpu_abi,
Flag, flag, wrapOpenArrayLenType

# ############################################################
#
# AMD GPUs API
#
# ############################################################

# Hip Runtime API
# ------------------------------------------------------------

template check*(status: HipError) =
  ## Evaluate a HIP API call exactly once and verify its status code.
  ## On failure, print a stack trace and diagnostic context
  ## (call-site expression and location) to stderr, then exit with code 1.
  let rc = status # bind once so the argument expression isn't re-evaluated
  if rc != hipSuccess:
    writeStackTrace()
    stderr.write(
      astToStr(status) & " " & $instantiationInfo() &
      " exited with error: " & $rc & '\n')
    quit 1

func hipModuleGetFunction*(kernel: var HipFunction, module: HipModule, fnName: openArray[char]): HipError {.inline.}=
  ## Convenience overload: look up kernel `fnName` in `module` and store the
  ## handle in `kernel`, forwarding to the raw `cstring`-taking ABI entry point.
  ## NOTE(review): passes `fnName[0].unsafeAddr` directly — this assumes
  ## `fnName` is non-empty and nul-terminated (e.g. comes from a Nim string
  ## or literal); an empty openArray would raise an IndexDefect. Verify callers.
  hipModuleGetFunction(kernel, module, fnName[0].unsafeAddr)

proc getGcnArchName*(deviceID: int32): string =
  ## Return the GCN architecture name of `deviceID` (e.g. "gfx90a"),
  ## stripping the nul padding of the fixed-size C char buffer
  ## in `HipDeviceProp.gcnArchName`.
  var props: HipDeviceProp
  check hipGetDeviceProperties(props, deviceID)

  for ch in props.gcnArchName:
    if ch == '\0':
      continue
    result.add ch

proc hipDeviceInit*(deviceID = 0'i32): HipDevice =
  ## Initialize the HIP runtime and return a handle to device `deviceID`.
  ## Prints the selected device's name and GCN architecture.
  ## Exits the program (via `check` / `quit`) if initialization fails
  ## or no ROCm/HIP-capable device is present.

  # BUGFIX: hipInit's sole parameter is a *flags* word that must currently
  # be 0 — it is NOT a device ordinal. The previous code passed
  # `deviceID.uint32` here, which only behaved correctly for device 0.
  check hipInit(0)

  var devCount: int32
  check hipGetDeviceCount(devCount)
  if devCount == 0:
    echo "hipDeviceInit error: no devices supporting AMD ROCm/HIP"
    quit 1

  var hipDevice: HipDevice
  check hipDeviceGet(hipDevice, deviceID)

  # Fixed-size scratch buffer; the runtime writes a nul-terminated name,
  # hence the cstring conversion when echoing.
  var name = newString(128)
  check hipDeviceGetName(name[0].addr, name.len.int32, hipDevice)
  echo "Using HIP Device [", deviceID, "]: ", cstring(name)
  echo "AMD GCN ARCH: ", deviceID.getGcnArchName()

  return hipDevice

# ############################################################
#
# LLVM IR for AMD GPUs
#
# ############################################################
#
# Note:
# __device__ functions for field and elliptic curve arithmetic
# might be compiled by default with scalar codegen
#
# We will need to either:
# - Derive explicitly a vectorized version of the warp/wave size (32)
# - Derive implicitly a vectorized version, probably with __forceinline__

proc wrapInCallableHipKernel*(module: ModuleRef, fn: FnDef) =
  ## Emit a public HIP kernel entry point wrapping the device function `fn`.
  ##
  ## The wrapper bears the original name with a `_public` suffix, so a
  ## function named `addmod` can be retrieved with
  ## check hipModuleGetFunction(fnPointer, hipModule, "addmod_public")

  let wrapperName = fn.fnImpl.getName() & "_public"
  let wrapper = module.addFunction(cstring(wrapperName), fn.fnTy)

  let ctx = module.getContext()
  let b = ctx.createBuilder()
  defer: b.dispose()

  b.positionAtEnd(ctx.appendBasicBlock(wrapper, "publicKernelBody"))

  # Forward every parameter of the wrapper to the wrapped implementation.
  var fwdArgs = newSeq[ValueRef](fn.fnTy.countParamTypes())
  for i, fwdArg in mpairs(fwdArgs):
    fwdArg = wrapper.getParam(i.uint32)
  discard b.call2(fn.fnTy, fn.fnImpl, fwdArgs)

  # A public kernel must return void and use the AMDGPU kernel
  # calling convention.
  b.retVoid()
  wrapper.setCallingConvention(AMDGPU_KERNEL)

# ############################################################
#
# Code generation
#
# ############################################################

template check*(status: ComgrStatus) =
  ## Evaluate a Comgr API call exactly once and verify its status code.
  ## On failure, print a stack trace and diagnostic context
  ## (call-site expression and location) to stderr, then exit with code 1.
  let rc = status # bind once so the argument expression isn't re-evaluated
  if rc != AMD_COMGR_STATUS_SUCCESS:
    writeStackTrace()
    stderr.write(
      astToStr(status) & " " & $instantiationInfo() &
      " exited with error: " & $rc & '\n')
    quit 1


proc linkAmdGpu*(reloc_obj: seq[byte], gcnArchName: string): seq[byte] {.noInline.} =
  ## Link a relocatable object code
  ## into an executable that can be used through hipModuleLoadData
  ##
  ## `gcnArchName` selects the target ISA (e.g. "gfx90a", see getGcnArchName);
  ## it is embedded in the "amdgcn-amd-amdhsa--<arch>" triple below.
  ## Exits the program (via `check`) on any Comgr failure.
  ## NOTE(review): `reloc_obj[0].addr` assumes a non-empty input — an empty
  ## seq would raise an IndexDefect; confirm callers never pass empty code.

  # Wrap the input bytes in a Comgr data object of kind RELOCATABLE.
  # Each Comgr handle is paired with a `defer`red release/destroy so
  # cleanup runs in reverse order even if `check` exits early.
  var roc: ComgrData
  check amd_comgr_create_data(AMD_COMGR_DATA_KIND_RELOCATABLE, roc)
  defer: check amd_comgr_release_data(roc)

  var ai: ComgrActionInfo
  check amd_comgr_create_action_info(ai)
  defer: check amd_comgr_destroy_action_info(ai)

  # Input dataset (holds the relocatable) and output dataset (receives
  # the linked executable).
  var ds: ComgrDataset
  check amd_comgr_create_data_set(ds)
  defer: check amd_comgr_destroy_data_set(ds)

  var dsOut: ComgrDataset
  check amd_comgr_create_data_set(dsOut)
  defer: check amd_comgr_destroy_data_set(dsOut)

  check roc.amd_comgr_set_data(reloc_obj.len.csize_t(), reloc_obj[0].addr)
  check roc.amd_comgr_set_data_name("linkAmdGpu-input.o")
  check ds.amd_comgr_data_set_add(roc)

  # Target the device's ISA, e.g. "amdgcn-amd-amdhsa--gfx90a".
  check ai.amd_comgr_action_info_set_isa_name(
    cstring("amdgcn-amd-amdhsa--" & gcnArchName)
  )

  check amd_comgr_do_action(
    AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE,
    info = ai,
    input = ds,
    output = dsOut)

  # Extract the executable
  # ------------------------------------------------

  var exe: ComgrData
  check amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, exe)
  defer: check amd_comgr_release_data(exe)

  check amd_comgr_action_data_get_data(
    dsOut, AMD_COMGR_DATA_KIND_EXECUTABLE,
    index = 0, exe)

  # Query the required buffer size
  # (first call with a nil buffer only reports the size).
  var size: csize_t
  check amd_comgr_get_data(
    exe, size, nil)

  # Size includes nul char
  # But we use seq[byte] not a string, so Nim doesn't auto-insert a \0
  # Hence allocation size is exact.
  result.setLen(int size)

  # Second call with a sized buffer copies the executable bytes out.
  check amd_comgr_get_data(
    exe, size, result[0].addr)


# ############################################################
#
# Code execution
#
# ############################################################
12 changes: 6 additions & 6 deletions constantine/math_compiler/codegen_nvidia.nim
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,15 @@
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
constantine/platforms/llvm/bindings/nvidia_abi {.all.},
constantine/platforms/llvm/bindings/c_abi,
constantine/platforms/abis/nvidia_abi {.all.},
constantine/platforms/abis/c_abi,
constantine/platforms/llvm/[llvm, nvidia_inlineasm],
constantine/platforms/primitives,
./ir

export
nvidia_abi, nvidia_inlineasm,
Flag, flag
Flag, flag, wrapOpenArrayLenType

# ############################################################
#
Expand Down Expand Up @@ -131,7 +131,7 @@ proc tagCudaKernel(module: ModuleRef, fn: FnDef) =
]))
)

proc setCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
proc wrapInCallableCudaKernel*(module: ModuleRef, fn: FnDef) =
## Create a public wrapper of a cuda device function
##
## A function named `addmod` can be found by appending _public
Expand Down Expand Up @@ -202,7 +202,7 @@ proc codegenNvidiaPTX*(asy: Assembler_LLVM, sm: tuple[major, minor: int32]): str
errMsg.dispose()
quit 1

return machine.emitToString(asy.module, AssemblyFile)
return machine.emitTo[:string](asy.module, AssemblyFile)

# ############################################################
#
Expand Down Expand Up @@ -263,4 +263,4 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =

check cuMemFree(rGPU)
check cuMemFree(aGPU)
check cuMemFree(bGPU)
check cuMemFree(bGPU)
Loading

0 comments on commit 1e34ec2

Please sign in to comment.