From 25092a398299abe2075b9333bb451f9cd480ac83 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 11 Oct 2024 10:41:47 +0200 Subject: [PATCH] Initial RISC-V support. Co-authored-by: Alex Fan --- Make.inc | 13 +- base/binaryplatforms.jl | 5 +- base/cpuid.jl | 3 + cli/trampolines/trampolines_riscv64.S | 20 ++ contrib/generate_precompile.jl | 9 +- contrib/normalize_triplet.py | 1 + doc/src/devdocs/build/build.md | 1 + doc/src/devdocs/build/riscv.md | 103 +++++++++ src/abi_riscv.cpp | 315 ++++++++++++++++++++++++++ src/aotcompile.cpp | 3 +- src/ccall.cpp | 3 + src/codegen.cpp | 7 +- src/disasm.cpp | 2 + src/jitlayers.cpp | 26 ++- src/jitlayers.h | 4 + src/julia_internal.h | 9 +- src/julia_threads.h | 2 +- src/llvm-ptls.cpp | 2 + src/llvm-version.h | 4 + src/runtime_intrinsics.c | 4 +- src/signal-handling.c | 19 +- src/signals-unix.c | 8 + src/stackwalk.c | 39 ++++ src/support/platform.h | 3 + src/task.c | 8 + src/threading.c | 4 +- 26 files changed, 599 insertions(+), 18 deletions(-) create mode 100644 cli/trampolines/trampolines_riscv64.S create mode 100644 doc/src/devdocs/build/riscv.md create mode 100644 src/abi_riscv.cpp diff --git a/Make.inc b/Make.inc index 53aee8a269732..cb79e3ca1b5a9 100644 --- a/Make.inc +++ b/Make.inc @@ -938,8 +938,12 @@ endif #If nothing is set default to native unless we are cross-compiling ifeq ($(MARCH)$(MCPU)$(MTUNE)$(JULIA_CPU_TARGET)$(XC_HOST),) -ifeq ($(ARCH),aarch64) #ARM recommends only setting MCPU for AArch64 +ifeq ($(ARCH),aarch64) +# ARM recommends only setting MCPU for AArch64 MCPU=native +else ifneq (,$(findstring riscv64,$(ARCH))) +# RISC-V doesn't have a native option +$(error Building for RISC-V requires a specific MARCH to be set) else MARCH=native MTUNE=native @@ -995,6 +999,9 @@ endif ifneq (,$(findstring arm,$(ARCH))) DIST_ARCH:=arm endif +ifneq (,$(findstring riscv64,$(ARCH))) +DIST_ARCH:=riscv64 +endif JULIA_BINARYDIST_FILENAME := julia-$(JULIA_COMMIT)-$(DIST_OS)$(DIST_ARCH) endif @@ -1018,8 +1025,12 @@ ifneq
($(MARCH),) CC += -march=$(MARCH) CXX += -march=$(MARCH) FC += -march=$(MARCH) +# On RISC-V, don't forward the MARCH ISA string to JULIA_CPU_TARGET, +# as it's always incompatible with LLVM's CPU target name parser. +ifeq (,$(findstring riscv64,$(ARCH))) JULIA_CPU_TARGET ?= $(MARCH) endif +endif # Set MCPU-specific flags ifneq ($(MCPU),) diff --git a/base/binaryplatforms.jl b/base/binaryplatforms.jl index c8a55c99a5724..a372137edeb98 100644 --- a/base/binaryplatforms.jl +++ b/base/binaryplatforms.jl @@ -597,7 +597,7 @@ const arch_mapping = Dict( "armv7l" => "arm(v7l)?", # if we just see `arm-linux-gnueabihf`, we assume it's `armv7l` "armv6l" => "armv6l", "powerpc64le" => "p(ower)?pc64le", - "riscv64" => "riscv64", + "riscv64" => "(rv64|riscv64)", ) # Keep this in sync with `CPUID.ISAs_by_family` # These are the CPUID side of the microarchitectures targeted by GCC flags in BinaryBuilder.jl @@ -631,6 +631,9 @@ const arch_march_isa_mapping = let "a64fx" => get_set("aarch64", "a64fx"), "apple_m1" => get_set("aarch64", "apple_m1"), ], + "riscv64" => [ + "riscv64" => get_set("riscv64", "riscv64") + ], "powerpc64le" => [ "power8" => get_set("powerpc64le", "power8"), ], diff --git a/base/cpuid.jl b/base/cpuid.jl index f653ba27b4bcd..0370bd33b83e5 100644 --- a/base/cpuid.jl +++ b/base/cpuid.jl @@ -61,6 +61,9 @@ const ISAs_by_family = Dict( "a64fx" => ISA(Set((JL_AArch64_v8_2a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_sha2, JL_AArch64_ccpp, JL_AArch64_complxnum, JL_AArch64_fullfp16, JL_AArch64_sve))), "apple_m1" => ISA(Set((JL_AArch64_v8_5a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_aes, JL_AArch64_sha2, JL_AArch64_sha3, JL_AArch64_ccpp, JL_AArch64_complxnum, JL_AArch64_fp16fml, JL_AArch64_fullfp16, JL_AArch64_dotprod, JL_AArch64_rcpc, JL_AArch64_altnzcv))), ], + "riscv64" => [ + "riscv64" => ISA(Set{UInt32}()), + ], "powerpc64le" => [ # We have no way to test powerpc64le features yet, so we're only going to declare the lowest ISA: "power8" 
=> ISA(Set{UInt32}()), diff --git a/cli/trampolines/trampolines_riscv64.S b/cli/trampolines/trampolines_riscv64.S new file mode 100644 index 0000000000000..26307b7c2bb36 --- /dev/null +++ b/cli/trampolines/trampolines_riscv64.S @@ -0,0 +1,20 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +#include "common.h" +#include "../../src/jl_exported_funcs.inc" + +#define SEP ; + +#define XX(name) \ +.global CNAME(name) SEP \ +.cfi_startproc SEP \ +.p2align 2 SEP \ + CNAME(name)##: SEP \ + auipc t3, %pcrel_hi(CNAMEADDR(name)) SEP \ + ld t3, %pcrel_lo(CNAME(name))(t3) SEP \ + jr t3 SEP \ +.cfi_endproc SEP \ + +JL_RUNTIME_EXPORTED_FUNCS(XX) +JL_CODEGEN_EXPORTED_FUNCS(XX) +#undef XX diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index 60f7290c7a0ac..04d13011d6223 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -202,12 +202,15 @@ if Artifacts !== nothing using Artifacts, Base.BinaryPlatforms, Libdl artifacts_toml = abspath(joinpath(Sys.STDLIB, "Artifacts", "test", "Artifacts.toml")) artifact_hash("HelloWorldC", artifacts_toml) - oldpwd = pwd(); cd(dirname(artifacts_toml)) - macroexpand(Main, :(@artifact_str("HelloWorldC"))) - cd(oldpwd) artifacts = Artifacts.load_artifacts_toml(artifacts_toml) platforms = [Artifacts.unpack_platform(e, "HelloWorldC", artifacts_toml) for e in artifacts["HelloWorldC"]] best_platform = select_platform(Dict(p => triplet(p) for p in platforms)) + if best_platform !== nothing + # @artifact errors for unsupported platforms + oldpwd = pwd(); cd(dirname(artifacts_toml)) + macroexpand(Main, :(@artifact_str("HelloWorldC"))) + cd(oldpwd) + end dlopen("libjulia$(Base.isdebugbuild() ? 
"-debug" : "")", RTLD_LAZY | RTLD_DEEPBIND) """ end diff --git a/contrib/normalize_triplet.py b/contrib/normalize_triplet.py index b1bab29487b8f..833b725480996 100755 --- a/contrib/normalize_triplet.py +++ b/contrib/normalize_triplet.py @@ -14,6 +14,7 @@ 'i686': "i\\d86", 'aarch64': "(arm|aarch)64", 'armv7l': "arm(v7l)?", + 'riscv64': "(rv64|riscv64)", 'powerpc64le': "p(ower)?pc64le", } platform_mapping = { diff --git a/doc/src/devdocs/build/build.md b/doc/src/devdocs/build/build.md index 0ef9ce4e4f071..553f7c2e815cf 100644 --- a/doc/src/devdocs/build/build.md +++ b/doc/src/devdocs/build/build.md @@ -148,6 +148,7 @@ Notes for various operating systems: Notes for various architectures: * [ARM](https://github.com/JuliaLang/julia/blob/master/doc/src/devdocs/build/arm.md) +* [RISC-V](https://github.com/JuliaLang/julia/blob/master/doc/src/devdocs/build/riscv.md) ## Required Build Tools and External Libraries diff --git a/doc/src/devdocs/build/riscv.md b/doc/src/devdocs/build/riscv.md new file mode 100644 index 0000000000000..7c0e7ab29d9f8 --- /dev/null +++ b/doc/src/devdocs/build/riscv.md @@ -0,0 +1,103 @@ +# RISC-V (Linux) + +Julia has experimental support for 64-bit RISC-V (RV64) processors running +Linux. This file provides general guidelines for compilation, in addition to +instructions for specific devices. + +A list of [known issues](https://github.com/JuliaLang/julia/labels/system:riscv) +for RISC-V is available. If you encounter difficulties, please create an issue +including the output from `cat /proc/cpuinfo`. + + +## Compiling Julia + +For now, Julia will need to be compiled entirely from source, i.e., including +all of its dependencies. This can be accomplished with the following +`Make.user`: + +```make +USE_BINARYBUILDER := 0 +``` + +Additionally, it is required to indicate what architecture, and optionally which +CPU to build for. 
This can be done by setting the `MARCH` and `MCPU` variables +in `Make.user`. + +The `MARCH` variable needs to be set to a RISC-V ISA string, which can be found by +looking at the documentation of your device, or by inspecting `/proc/cpuinfo`. Only +use flags that your compiler supports, e.g., run `gcc -march=help` to see a list of +supported flags. A common value is `rv64gc`, which is a good starting point. + +The `MCPU` variable is optional, and can be used to further optimize the +generated code for a specific CPU. If you are unsure, it is recommended to leave +it unset. You can find a list of supported values by running `gcc --target-help`. + +For example, if you are using a StarFive VisionFive2, which contains a JH7110 +processor based on the SiFive U74, you can set these flags as follows: + +```make +MARCH := rv64gc_zba_zbb +MCPU := sifive-u74 +``` + +If you prefer a portable build, you could use: + +```make +MARCH := rv64gc + +# also set JULIA_CPU_TARGET to the expanded form of rv64gc +# (it normally copies the value of MCPU, which we don't set) +JULIA_CPU_TARGET := generic-rv64,i,m,a,f,d,zicsr,zifencei,c +``` + +### Cross-compilation + +A native build on a RISC-V device may take a very long time, so it's also +possible to cross-compile Julia on a faster machine. + +First, get a hold of a RISC-V cross-compilation toolchain that provides +support for C, C++ and Fortran. This can be done by checking out the +[riscv-gnu-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain) +repository and building it as follows: + +```sh +sudo mkdir /opt/riscv && sudo chown $USER /opt/riscv +./configure --prefix=/opt/riscv --with-languages=c,c++,fortran +make linux -j$(nproc) +``` + +Then, install the QEMU user-mode emulator for RISC-V, along with `binfmt` +support to enable execution of RISC-V binaries on the host machine. 
The +exact steps depend on your distribution, e.g., on Arch Linux it involves +installing the `qemu-user-static` and `qemu-user-static-binfmt` packages. +Note that to actually execute RISC-V binaries, QEMU will need to be able to +find the RISC-V system root, which can be achieved by setting the +`QEMU_LD_PREFIX` environment variable to the path of the root filesystem. + +Finally, compile Julia with the following `Make.user` variables (in addition to +the ones from the previous section): + +```make +XC_HOST=riscv64-unknown-linux-gnu +OS=Linux +export QEMU_LD_PREFIX=/opt/riscv/sysroot +``` + +Note that you will have to execute `make` with `PATH` set to include the +cross-compilation toolchain, e.g., by running: + +```sh +PATH=/opt/riscv/bin:$PATH make -j$(nproc) +``` + +Because the RISC-V sysroot we use is very barren, you may need to +add additional libraries that the Julia build system currently expects +to be available system-wide. For example, the build currently relies on +a system-provided `libz`, so you may need to copy this library from the +Julia build into the system root: + +```sh +make -C deps install-zlib +cp -v usr/lib/libz.* /opt/riscv/sysroot/usr/lib +cp -v usr/include/z*.h /opt/riscv/sysroot/usr/include +``` diff --git a/src/abi_riscv.cpp b/src/abi_riscv.cpp new file mode 100644 index 0000000000000..cbd85892801c8 --- /dev/null +++ b/src/abi_riscv.cpp @@ -0,0 +1,315 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +//===----------------------------------------------------------------------===// +// +// The ABI implementation used for RISC-V targets. 
+// +//===----------------------------------------------------------------------===// +// +// The Procedure Call Standard can be found here: +// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc +// +// This code is based on: +// - The Rust implementation: +// https://github.com/rust-lang/rust/blob/master/compiler/rustc_target/src/abi/call/riscv.rs +// - The LLVM RISC-V backend: +// https://github.com/llvm/llvm-project/blob/78533528cf5ed04ac78722afff7c9f2f91aa8359/llvm/lib/Target/RISCV/RISCVISelLowering.cpp#L10865 +// +//===----------------------------------------------------------------------===// + + +struct ABI_RiscvLayout : AbiLayout { + +static const size_t XLen = 8; +static const size_t FLen = 8; +static const int NumArgGPRs = 8; +static const int NumArgFPRs = 8; + +// available register num is needed to determine if fp pair or int-fp pair in a struct should be unpacked +// WARN: with this, use_sret must only be called once before the next +// needPassByRef call, otherwise avail_gprs is wrong +int avail_gprs, avail_fprs; + +// preferred type is determined in the same time of use_sret & needPassByRef +// cache it here to avoid computing it again in preferred_llvm_type +Type *cached_llvmtype = NULL; + +ABI_RiscvLayout() : avail_gprs(NumArgGPRs), avail_fprs(NumArgFPRs) {} + +enum RegPassKind { UNKNOWN = 0, INTEGER = 1, FLOAT = 2 }; + +struct ElementType { + RegPassKind type; + jl_datatype_t *dt; + ElementType() : type(RegPassKind::UNKNOWN), dt(NULL) {}; +}; + +bool is_floattype(jl_datatype_t *dt) const +{ + return dt == jl_float16_type || dt == jl_float32_type || dt == jl_float64_type; +} + +Type *get_llvm_fptype(jl_datatype_t *dt, LLVMContext &ctx) const +{ + assert(is_floattype(dt)); + switch (jl_datatype_size(dt)) { + case 2: return Type::getHalfTy(ctx); + case 4: return Type::getFloatTy(ctx); + case 8: return Type::getDoubleTy(ctx); + case 16: return Type::getFP128Ty(ctx); + default: assert(0 && "abi_riscv: unsupported floating 
point type"); return NULL; + } +} + +// for primitive types that can be passed as integer +// includes integer, bittypes, pointer +Type *get_llvm_inttype(jl_datatype_t *dt, LLVMContext &ctx) const +{ + assert(jl_is_primitivetype(dt)); + // XXX: without Zfh, Float16 is passed in integer registers + if (dt == jl_float16_type) + return Type::getInt32Ty(ctx); + assert(!is_floattype(dt)); + if (dt == jl_bool_type) + return getInt8Ty(ctx); + if (dt == jl_int32_type) + return getInt32Ty(ctx); + if (dt == jl_int64_type) + return getInt64Ty(ctx); + int nb = jl_datatype_size(dt); + return Type::getIntNTy(ctx, nb * 8); +} + +bool should_use_fp_conv(jl_datatype_t *dt, ElementType &ele1, ElementType &ele2) const +{ + if (jl_is_primitivetype(dt)) { + size_t dsz = jl_datatype_size(dt); + if (dsz > FLen) { + return false; + } + if (is_floattype(dt)) { + if (ele1.type == RegPassKind::UNKNOWN) { + ele1.type = RegPassKind::FLOAT; + ele1.dt = dt; + } + else if (ele2.type == RegPassKind::UNKNOWN) { + ele2.type = RegPassKind::FLOAT; + ele2.dt = dt; + } + else { + // 3 elements not eligible, must be a pair + return false; + } + } + // integer or pointer type or bitstypes + else { + if (ele1.type == RegPassKind::UNKNOWN) { + ele1.type = RegPassKind::INTEGER; + ele1.dt = dt; + } + else if (ele1.type == RegPassKind::INTEGER) { + // two integers not eligible + return false; + } + // ele1.type == RegPassKind::FLOAT + else { + if (ele2.type == RegPassKind::UNKNOWN) { + ele2.type = RegPassKind::INTEGER; + ele2.dt = dt; + } + else { + // 3 elements not eligible, must be a pair + return false; + } + } + } + } + else { // aggregates + while (size_t nfields = jl_datatype_nfields(dt)) { + size_t i; + size_t fieldsz; + for (i = 0; i < nfields; i++) { + if ((fieldsz = jl_field_size(dt, i))) { + break; + } + } + assert(i < nfields); + // If there's only one non zero sized member, try again on this member + if (fieldsz == jl_datatype_size(dt)) { + dt = (jl_datatype_t *)jl_field_type(dt, i); + if 
(!jl_is_datatype(dt)) // could be inline union #46787 + return false; + continue; + } + for (; i < nfields; i++) { + size_t fieldsz = jl_field_size(dt, i); + if (fieldsz == 0) + continue; + jl_datatype_t *fieldtype = (jl_datatype_t *)jl_field_type(dt, i); + if (!jl_is_datatype(fieldtype)) // could be inline union + return false; + // This needs to be done after the zero size member check + if (ele2.type != RegPassKind::UNKNOWN) { + // we already have a pair and can't accept more elements + return false; + } + if (!should_use_fp_conv(fieldtype, ele1, ele2)) { + return false; + } + } + break; + } + } + // Tuple{Int,} can reach here as well, but doesn't really hurt + return true; +} + +Type *get_llvm_inttype_byxlen(size_t xlen, LLVMContext &ctx) const +{ + if (xlen == 8) { + return getInt64Ty(ctx); + } + else if (xlen == 4) { + return getInt32Ty(ctx); + } + else { + assert(0 && "abi_riscv: unsupported xlen"); + return NULL; + } +} + +Type *classify_arg(jl_datatype_t *ty, int &avail_gprs, int &avail_fprs, bool &onstack, + LLVMContext &ctx) const +{ + onstack = false; + if (ty == jl_nothing_type) { + return NULL; + } + ElementType ele1, ele2; + if (should_use_fp_conv(ty, ele1, ele2)) { + if (ele1.type == RegPassKind::FLOAT) { + if (ele2.type == RegPassKind::FLOAT) { + if (avail_fprs >= 2) { + avail_fprs -= 2; + SmallVector eles; + eles.push_back(get_llvm_fptype(ele1.dt, ctx)); + eles.push_back(get_llvm_fptype(ele2.dt, ctx)); + return StructType::get(ctx, eles); + } + } + else if (ele2.type == RegPassKind::INTEGER) { + if (avail_fprs >= 1 && avail_gprs >= 1) { + avail_fprs -= 1; + avail_gprs -= 1; + SmallVector eles; + eles.push_back(get_llvm_fptype(ele1.dt, ctx)); + eles.push_back(get_llvm_inttype(ele2.dt, ctx)); + return StructType::get(ctx, eles); + } + } + else { + // A struct containing just one floating-point real is passed + // as though it were a standalone floating-point real. 
+ if (avail_fprs >= 1) { + avail_fprs -= 1; + return get_llvm_fptype(ele1.dt, ctx); + } + } + } + else if (ele1.type == RegPassKind::INTEGER) { + if (ele2.type == RegPassKind::FLOAT) { + if (avail_fprs >= 1 && avail_gprs >= 1) { + avail_fprs -= 1; + avail_gprs -= 1; + return StructType::get(get_llvm_inttype(ele1.dt, ctx), + get_llvm_fptype(ele2.dt, ctx)); + } + } + } + } + size_t dsz = jl_datatype_size(ty); + if (dsz > 2 * XLen) { + if (!jl_is_primitivetype(ty)) { + onstack = true; + } + // else let llvm backend handle scalars + if (avail_gprs >= 1) { + avail_gprs -= 1; + } + return NULL; + } + + if (dsz > XLen) { + size_t alignment = jl_datatype_align(ty); + bool align_regs = alignment > XLen; + if (avail_gprs >= 2) { + avail_gprs -= 2; + } + // should we handle variadic as well? + // Variadic arguments with 2×XLEN-bit alignment and size at most 2×XLEN + // bits are passed in an aligned register pair + else { + avail_gprs = 0; + } + + if (!jl_is_primitivetype(ty)) { + // Aggregates or scalars passed on the stack are aligned to the + // greater of the type alignment and XLen bits, but never more than + // the stack alignment. + if (align_regs) { + if (alignment == 16) { + return Type::getInt128Ty(ctx); + } + else { + return Type::getInt64Ty(ctx); + } + } + else { + return ArrayType::get(get_llvm_inttype_byxlen(XLen, ctx), 2); + } + } + // let llvm backend handle scalars + return NULL; + } + + //else dsz <= XLen + if (avail_gprs >= 1) { + avail_gprs -= 1; + } + if (!jl_is_primitivetype(ty)) { + return get_llvm_inttype_byxlen(XLen, ctx); + } + return get_llvm_inttype(ty, ctx); +} + +bool use_sret(jl_datatype_t *ty, LLVMContext &ctx) override +{ + bool onstack = false; + int gprs = 2; + int fprs = FLen ? 
2 : 0; + this->cached_llvmtype = classify_arg(ty, gprs, fprs, onstack, ctx); + if (onstack) { + this->avail_gprs -= 1; + return true; + } + else { + return false; + } +} + +bool needPassByRef(jl_datatype_t *ty, AttrBuilder &ab, LLVMContext &ctx, + Type *Ty) override +{ + bool onstack = false; + this->cached_llvmtype = + classify_arg(ty, this->avail_gprs, this->avail_fprs, onstack, ctx); + return onstack; +} + +Type *preferred_llvm_type(jl_datatype_t *ty, bool isret, + LLVMContext &ctx) const override +{ + return this->cached_llvmtype; +} + +}; diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index c2f112f9c9d5c..279686c387e1b 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -1664,7 +1664,8 @@ void jl_dump_native_impl(void *native_code, } CodeModel::Model CMModel = CodeModel::Small; - if (TheTriple.isPPC() || (TheTriple.isX86() && TheTriple.isArch64Bit() && TheTriple.isOSLinux())) { + if (TheTriple.isPPC() || TheTriple.isRISCV() || + (TheTriple.isX86() && TheTriple.isArch64Bit() && TheTriple.isOSLinux())) { // On PPC the small model is limited to 16bit offsets. 
For very large images the small code model CMModel = CodeModel::Medium; // isn't good enough on x86 so use Medium, it has no cost because only the image goes in .ldata } diff --git a/src/ccall.cpp b/src/ccall.cpp index 2de5be6906e7c..f559ddbe93a43 100644 --- a/src/ccall.cpp +++ b/src/ccall.cpp @@ -367,6 +367,7 @@ static bool is_native_simd_type(jl_datatype_t *dt) { #include "abi_arm.cpp" #include "abi_aarch64.cpp" +#include "abi_riscv.cpp" #include "abi_ppc64le.cpp" #include "abi_win32.cpp" #include "abi_win64.cpp" @@ -391,6 +392,8 @@ static bool is_native_simd_type(jl_datatype_t *dt) { typedef ABI_ARMLayout DefaultAbiState; #elif defined _CPU_AARCH64_ typedef ABI_AArch64Layout DefaultAbiState; +#elif defined _CPU_RISCV64_ + typedef ABI_RiscvLayout DefaultAbiState; #elif defined _CPU_PPC64_ typedef ABI_PPC64leLayout DefaultAbiState; #else diff --git a/src/codegen.cpp b/src/codegen.cpp index bcda527416676..ca38ae8ddb288 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -5368,7 +5368,7 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos } CallInst *call = ctx.builder.CreateCall(cft, TheCallee, argvals); call->setAttributes(returninfo.attrs); - if (gcstack_arg) + if (gcstack_arg && !ctx.emission_context.TargetTriple.isRISCV()) call->setCallingConv(CallingConv::Swift); jl_cgval_t retval; @@ -8186,7 +8186,8 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value if (gcstack_arg){ AttrBuilder param(ctx.builder.getContext()); - param.addAttribute(Attribute::SwiftSelf); + if (!ctx.emission_context.TargetTriple.isRISCV()) + param.addAttribute(Attribute::SwiftSelf); param.addAttribute(Attribute::NonNull); attrs.push_back(AttributeSet::get(ctx.builder.getContext(), param)); fsig.push_back(PointerType::get(JuliaType::get_ppjlvalue_ty(ctx.builder.getContext()), 0)); @@ -8278,7 +8279,7 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value fval = emit_inttoptr(ctx, fval, 
ftype->getPointerTo()); } if (auto F = dyn_cast(fval)) { - if (gcstack_arg) + if (gcstack_arg && !ctx.emission_context.TargetTriple.isRISCV()) F->setCallingConv(CallingConv::Swift); assert(F->arg_size() >= argnames.size()); for (size_t i = 0; i < argnames.size(); i++) { diff --git a/src/disasm.cpp b/src/disasm.cpp index ebe8f2ac397c0..b944e48430c29 100644 --- a/src/disasm.cpp +++ b/src/disasm.cpp @@ -1058,6 +1058,8 @@ static void jl_dump_asm_internal( if (insSize == 0) // skip illegible bytes #if defined(_CPU_PPC_) || defined(_CPU_PPC64_) || defined(_CPU_ARM_) || defined(_CPU_AARCH64_) insSize = 4; // instructions are always 4 bytes +#elif defined(_CPU_RISCV64_) + insSize = 2; // instructions can be 2 bytes when compressed #else insSize = 1; // attempt to slide 1 byte forward #endif diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 4ff7400df13dd..313449dda5557 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -998,6 +998,16 @@ namespace { #if defined(MSAN_EMUTLS_WORKAROUND) options.EmulatedTLS = true; options.ExplicitEmulatedTLS = true; +#endif +#if defined(_CPU_RISCV64_) + // we set these manually to avoid LLVM defaulting to soft-float +#if defined(__riscv_float_abi_double) + options.MCOptions.ABIName = "lp64d"; +#elif defined(__riscv_float_abi_single) + options.MCOptions.ABIName = "lp64f"; +#else + options.MCOptions.ABIName = "lp64"; +#endif #endif uint32_t target_flags = 0; auto target = jl_get_llvm_target(imaging_default(), target_flags); @@ -1042,11 +1052,23 @@ namespace { #endif if (TheTriple.isAArch64()) codemodel = CodeModel::Small; + else if (TheTriple.isRISCV()) { + // RISC-V will support large code model in LLVM 21 + // https://github.com/llvm/llvm-project/pull/70308 + codemodel = CodeModel::Medium; + } + // Generate simpler code for JIT + Reloc::Model relocmodel = Reloc::Static; + if (TheTriple.isRISCV()) { + // until large code model is supported, use PIC for RISC-V + // https://github.com/llvm/llvm-project/issues/106203 + relocmodel = 
Reloc::PIC_; + } auto optlevel = CodeGenOptLevelFor(jl_options.opt_level); auto TM = TheTarget->createTargetMachine( TheTriple.getTriple(), TheCPU, FeaturesStr, options, - Reloc::Static, // Generate simpler code for JIT + relocmodel, codemodel, optlevel, true // JIT @@ -1067,7 +1089,7 @@ namespace { .setCPU(TM.getTargetCPU().str()) .setFeatures(TM.getTargetFeatureString()) .setOptions(TM.Options) - .setRelocationModel(Reloc::Static) + .setRelocationModel(TM.getRelocationModel()) .setCodeModel(TM.getCodeModel()) .setCodeGenOptLevel(CodeGenOptLevelFor(optlevel)); } diff --git a/src/jitlayers.h b/src/jitlayers.h index 3353a4093bd27..47ab369f1e24a 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -58,6 +58,10 @@ # define JL_USE_JITLINK #endif +#if defined(_CPU_RISCV64_) +# define JL_USE_JITLINK +#endif + # include # include # include diff --git a/src/julia_internal.h b/src/julia_internal.h index 20d90fede3d5e..c09bfc5c3eb42 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -105,8 +105,8 @@ JL_DLLIMPORT void __tsan_switch_to_fiber(void *fiber, unsigned flags); #ifndef _OS_WINDOWS_ #if defined(_CPU_ARM_) || defined(_CPU_PPC_) || defined(_CPU_WASM_) #define MAX_ALIGN 8 - #elif defined(_CPU_AARCH64_) || (JL_LLVM_VERSION >= 180000 && (defined(_CPU_X86_64_) || defined(_CPU_X86_))) - // int128 is 16 bytes aligned on aarch64 and on x86 with LLVM >= 18 + #elif defined(_CPU_AARCH64_) || defined(_CPU_RISCV64_) || (JL_LLVM_VERSION >= 180000 && (defined(_CPU_X86_64_) || defined(_CPU_X86_))) + // int128 is 16 bytes aligned on aarch64 and riscv, and on x86 with LLVM >= 18 #define MAX_ALIGN 16 #elif defined(_P64) // Generically we assume MAX_ALIGN is sizeof(void*) @@ -259,6 +259,11 @@ static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT struct timeval tv; gettimeofday(&tv, NULL); return (int64_t)(tv.tv_sec) * 1000000 + tv.tv_usec; +#elif defined(_CPU_RISCV64_) + // taken from 
https://github.com/google/benchmark/blob/3b3de69400164013199ea448f051d94d7fc7d81f/src/cycleclock.h#L190 + uint64_t ret; + __asm__ volatile("rdcycle %0" : "=r"(ret)); + return ret; #elif defined(_CPU_PPC64_) // This returns a time-base, which is not always precisely a cycle-count. // https://reviews.llvm.org/D78084 diff --git a/src/julia_threads.h b/src/julia_threads.h index b697a0bf030ed..17e8d7d466044 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -56,7 +56,7 @@ typedef struct { !defined(JL_HAVE_ASM) && \ !defined(JL_HAVE_UNW_CONTEXT) #if (defined(_CPU_X86_64_) || defined(_CPU_X86_) || defined(_CPU_AARCH64_) || \ - defined(_CPU_ARM_) || defined(_CPU_PPC64_)) + defined(_CPU_ARM_) || defined(_CPU_PPC64_) || defined(_CPU_RISCV64_)) #define JL_HAVE_ASM #endif #if 0 diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp index 488dd46cade21..614ed15f840e6 100644 --- a/src/llvm-ptls.cpp +++ b/src/llvm-ptls.cpp @@ -117,6 +117,8 @@ Instruction *LowerPTLS::emit_pgcstack_tp(Value *offset, Instruction *insertBefor asm_str = "mrs $0, tpidr_el0"; } else if (TargetTriple.isARM()) { asm_str = "mrc p15, 0, $0, c13, c0, 3"; + } else if (TargetTriple.isRISCV()) { + asm_str = "mv $0, tp"; } else if (TargetTriple.getArch() == Triple::x86_64) { asm_str = "movq %fs:0, $0"; } else if (TargetTriple.getArch() == Triple::x86) { diff --git a/src/llvm-version.h b/src/llvm-version.h index 2a38bb7c488b8..984e918d480cc 100644 --- a/src/llvm-version.h +++ b/src/llvm-version.h @@ -18,6 +18,10 @@ #define JL_LLVM_OPAQUE_POINTERS 1 #endif +#if JL_LLVM_VERSION < 190000 && defined(_CPU_RISCV64_) + #error Only LLVM versions >= 19.0.0 are supported by Julia on RISC-V +#endif + #ifdef __cplusplus #if defined(__GNUC__) && (__GNUC__ >= 9) // Added in GCC 9, this warning is annoying diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c index db4007d32035e..450096eef5b01 100644 --- a/src/runtime_intrinsics.c +++ b/src/runtime_intrinsics.c @@ -256,7 +256,7 @@ JL_DLLEXPORT float 
julia_half_to_float(uint16_t param) { #if ((defined(__GNUC__) && __GNUC__ > 11) || \ (defined(__clang__) && __clang_major__ > 14)) && \ !defined(_CPU_PPC64_) && !defined(_CPU_PPC_) && \ - !defined(_OS_WINDOWS_) + !defined(_OS_WINDOWS_) && !defined(_CPU_RISCV64_) #define FLOAT16_TYPE _Float16 #define FLOAT16_TO_UINT16(x) (*(uint16_t*)&(x)) #define FLOAT16_FROM_UINT16(x) (*(_Float16*)&(x)) @@ -355,7 +355,7 @@ float julia_bfloat_to_float(uint16_t param) { #if ((defined(__GNUC__) && __GNUC__ > 12) || \ (defined(__clang__) && __clang_major__ > 16)) && \ !defined(_CPU_PPC64_) && !defined(_CPU_PPC_) && \ - !defined(_OS_WINDOWS_) + !defined(_OS_WINDOWS_) && !defined(_CPU_RISCV64_) #define BFLOAT16_TYPE __bf16 #define BFLOAT16_TO_UINT16(x) (*(uint16_t*)&(x)) #define BFLOAT16_FROM_UINT16(x) (*(__bf16*)&(x)) diff --git a/src/signal-handling.c b/src/signal-handling.c index d7f4697a3c4f0..ce7e8ba57af19 100644 --- a/src/signal-handling.c +++ b/src/signal-handling.c @@ -256,7 +256,8 @@ static uintptr_t jl_get_pc_from_ctx(const void *_ctx); void jl_show_sigill(void *_ctx); #if defined(_CPU_X86_64_) || defined(_CPU_X86_) \ || (defined(_OS_LINUX_) && defined(_CPU_AARCH64_)) \ - || (defined(_OS_LINUX_) && defined(_CPU_ARM_)) + || (defined(_OS_LINUX_) && defined(_CPU_ARM_)) \ + || (defined(_OS_LINUX_) && defined(_CPU_RISCV64_)) static size_t jl_safe_read_mem(const volatile char *ptr, char *out, size_t len) { jl_jmp_buf *old_buf = jl_get_safe_restore(); @@ -344,6 +345,8 @@ static uintptr_t jl_get_pc_from_ctx(const void *_ctx) return ((ucontext_t*)_ctx)->uc_mcontext.mc_gpregs.gp_elr; #elif defined(_OS_LINUX_) && defined(_CPU_ARM_) return ((ucontext_t*)_ctx)->uc_mcontext.arm_pc; +#elif defined(_OS_LINUX_) && defined(_CPU_RISCV64_) + return ((ucontext_t*)_ctx)->uc_mcontext.__gregs[REG_PC]; #else // TODO for PPC return 0; @@ -421,6 +424,20 @@ void jl_show_sigill(void *_ctx) jl_safe_printf("Invalid ARM instruction at %p: 0x%08" PRIx32 "\n", (void*)pc, inst); } } +#elif defined(_OS_LINUX_) 
&& defined(_CPU_RISCV64_) + uint32_t inst = 0; + size_t len = jl_safe_read_mem(pc, (char*)&inst, 4); + if (len < 2) + jl_safe_printf("Fault when reading instruction: %d bytes read\n", (int)len); + else if (inst == 0x00100073 || // ebreak + inst == 0xc0001073 || // unimp (pseudo-instruction for illegal `csrrw x0, cycle, x0`) + (inst & ((1 << 16) - 1)) == 0x0000) { // c.unimp (compressed form) + // The signal might actually be SIGTRAP instead, doesn't hurt to handle it here though. + jl_safe_printf("Unreachable reached at %p\n", pc); + } + else { + jl_safe_printf("Invalid instruction at %p: 0x%08" PRIx32 "\n", pc, inst); + } #else // TODO for PPC (void)_ctx; diff --git a/src/signals-unix.c b/src/signals-unix.c index f99eca31730b6..caf0e977929c5 100644 --- a/src/signals-unix.c +++ b/src/signals-unix.c @@ -80,6 +80,9 @@ static inline uintptr_t jl_get_rsp_from_ctx(const void *_ctx) #elif defined(_OS_LINUX_) && defined(_CPU_ARM_) const ucontext_t *ctx = (const ucontext_t*)_ctx; return ctx->uc_mcontext.arm_sp; +#elif defined(_OS_LINUX_) && (defined(_CPU_RISCV64_)) + const ucontext_t *ctx = (const ucontext_t*)_ctx; + return ctx->uc_mcontext.__gregs[REG_SP]; #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_) const ucontext_t *ctx = (const ucontext_t*)_ctx; return ctx->uc_mcontext.mc_rsp; @@ -175,6 +178,11 @@ JL_NO_ASAN static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int si ctx->uc_mcontext.arm_sp = rsp; ctx->uc_mcontext.arm_lr = 0; // Clear link register ctx->uc_mcontext.arm_pc = target; +#elif defined(_OS_LINUX_) && (defined(_CPU_RISCV64_)) + ucontext_t *ctx = (ucontext_t*)_ctx; + ctx->uc_mcontext.__gregs[REG_SP] = rsp; + ctx->uc_mcontext.__gregs[REG_RA] = 0; // Clear return address (ra) + ctx->uc_mcontext.__gregs[REG_PC] = (uintptr_t)fptr; #else #pragma message("julia: throw-in-context not supported on this platform") // TODO Add support for PowerPC(64)? 
diff --git a/src/stackwalk.c b/src/stackwalk.c index 5377d091cb780..460bfafb97d2b 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1066,6 +1066,45 @@ int jl_simulate_longjmp(jl_jmp_buf mctx, bt_context_t *c) JL_NOTSAFEPOINT mc->regs[0] = 1; assert(mc->sp % 16 == 0); return 1; + #elif defined(_CPU_RISCV64_) + // https://github.com/bminor/glibc/blob/master/sysdeps/riscv/bits/setjmp.h + // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_riscv + // https://github.com/llvm/llvm-project/blob/90149204bd08c07eb672cd5b19d782fed3d96ddc/libunwind/include/libunwind.h + mc->__gregs[1] = (*_ctx)->__pc; // ra + mc->__gregs[8] = (*_ctx)->__regs[0]; // s0 + mc->__gregs[9] = (*_ctx)->__regs[1]; // s1 + mc->__gregs[18] = (*_ctx)->__regs[2]; // s2 + mc->__gregs[19] = (*_ctx)->__regs[3]; // s3 + mc->__gregs[20] = (*_ctx)->__regs[4]; // s4 + mc->__gregs[21] = (*_ctx)->__regs[5]; // s5 + mc->__gregs[22] = (*_ctx)->__regs[6]; // s6 + mc->__gregs[23] = (*_ctx)->__regs[7]; // s7 + mc->__gregs[24] = (*_ctx)->__regs[8]; // s8 + mc->__gregs[25] = (*_ctx)->__regs[9]; // s9 + mc->__gregs[26] = (*_ctx)->__regs[10]; // s10 + mc->__gregs[27] = (*_ctx)->__regs[11]; // s11 + mc->__gregs[2] = (*_ctx)->__sp; // sp + #ifndef __riscv_float_abi_soft + mc->__fpregs.__q.__f[40] = (*_ctx)->__fpregs[0]; // fs0 + mc->__fpregs.__q.__f[41] = (*_ctx)->__fpregs[1]; // fs1 + mc->__fpregs.__q.__f[50] = (*_ctx)->__fpregs[2]; // fs2 + mc->__fpregs.__q.__f[51] = (*_ctx)->__fpregs[3]; // fs3 + mc->__fpregs.__q.__f[52] = (*_ctx)->__fpregs[4]; // fs4 + mc->__fpregs.__q.__f[53] = (*_ctx)->__fpregs[5]; // fs5 + mc->__fpregs.__q.__f[54] = (*_ctx)->__fpregs[6]; // fs6 + mc->__fpregs.__q.__f[55] = (*_ctx)->__fpregs[7]; // fs7 + mc->__fpregs.__q.__f[56] = (*_ctx)->__fpregs[8]; // fs8 + mc->__fpregs.__q.__f[57] = (*_ctx)->__fpregs[9]; // fs9 + mc->__fpregs.__q.__f[58] = (*_ctx)->__fpregs[10]; // fs10 + mc->__fpregs.__q.__f[59] = 
(*_ctx)->__fpregs[11]; // fs11 + #endif + // ifdef PTR_DEMANGLE ? + mc->__gregs[REG_SP] = ptr_demangle(mc->__gregs[REG_SP]); + mc->__gregs[REG_RA] = ptr_demangle(mc->__gregs[REG_RA]); + mc->__gregs[REG_PC] = mc->__gregs[REG_RA]; + mc->__gregs[REG_A0] = 1; + assert(mc->__gregs[REG_SP] % 16 == 0); + return 1; #else #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown linux") (void)mc; diff --git a/src/support/platform.h b/src/support/platform.h index a0dd84c9c20b6..816e2090b5a08 100644 --- a/src/support/platform.h +++ b/src/support/platform.h @@ -27,6 +27,7 @@ * _CPU_X86_64_ * _CPU_AARCH64_ * _CPU_ARM_ + * _CPU_RISCV64_ * _CPU_WASM_ */ @@ -106,6 +107,8 @@ #define _CPU_AARCH64_ #elif defined(__arm__) || defined(_M_ARM) #define _CPU_ARM_ +#elif defined(__riscv) && __riscv_xlen == 64 +#define _CPU_RISCV64_ #elif defined(__PPC64__) #define _CPU_PPC64_ #elif defined(_ARCH_PPC) diff --git a/src/task.c b/src/task.c index f86e0ab3a880d..be2631347e82e 100644 --- a/src/task.c +++ b/src/task.c @@ -1491,6 +1491,14 @@ CFI_NORETURN // because all our addresses are word-aligned. " udf #0" // abort : : "r" (stk), "r"(fn) : "memory" ); +#elif defined(_CPU_RISCV64_) + asm volatile( + " mv sp, %0;\n" + " mv ra, zero;\n" // Clear return address register + " mv fp, zero;\n" // Clear frame pointer + " jr %1;\n" // call `fn` with fake stack frame + " ebreak" // abort + : : "r"(stk), "r"(fn) : "memory" ); #elif defined(_CPU_PPC64_) // N.B.: There is two iterations of the PPC64 ABI. // v2 is current and used here. 
Make sure you have the diff --git a/src/threading.c b/src/threading.c index c26028d2f3da2..50944a24eb29b 100644 --- a/src/threading.c +++ b/src/threading.c @@ -18,7 +18,7 @@ // For variant 1 JL_ELF_TLS_INIT_SIZE is the size of the thread control block (TCB) // For variant 2 JL_ELF_TLS_INIT_SIZE is 0 #if defined(_OS_LINUX_) || defined(_OS_FREEBSD_) -# if defined(_CPU_X86_64_) || defined(_CPU_X86_) +# if defined(_CPU_X86_64_) || defined(_CPU_X86_) || defined(_CPU_RISCV64_) # define JL_ELF_TLS_VARIANT 2 # define JL_ELF_TLS_INIT_SIZE 0 # elif defined(_CPU_AARCH64_) @@ -638,6 +638,8 @@ static void jl_check_tls(void) asm("mrs %0, tpidr_el0" : "=r"(tp)); #elif defined(__ARM_ARCH) && __ARM_ARCH >= 7 asm("mrc p15, 0, %0, c13, c0, 3" : "=r"(tp)); +#elif defined(_CPU_RISCV64_) + asm("mv %0, tp" : "=r"(tp)); #else # error "Cannot emit thread pointer for this architecture." #endif