From 25092a398299abe2075b9333bb451f9cd480ac83 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Fri, 11 Oct 2024 10:41:47 +0200
Subject: [PATCH] Initial RISC-V support.

Co-authored-by: Alex Fan <alex.fan.q@gmail.com>
---
 Make.inc                              |  13 +-
 base/binaryplatforms.jl               |   5 +-
 base/cpuid.jl                         |   3 +
 cli/trampolines/trampolines_riscv64.S |  20 ++
 contrib/generate_precompile.jl        |   9 +-
 contrib/normalize_triplet.py          |   1 +
 doc/src/devdocs/build/build.md        |   1 +
 doc/src/devdocs/build/riscv.md        | 103 +++++++++
 src/abi_riscv.cpp                     | 315 ++++++++++++++++++++++++++
 src/aotcompile.cpp                    |   3 +-
 src/ccall.cpp                         |   3 +
 src/codegen.cpp                       |   7 +-
 src/disasm.cpp                        |   2 +
 src/jitlayers.cpp                     |  26 ++-
 src/jitlayers.h                       |   4 +
 src/julia_internal.h                  |   9 +-
 src/julia_threads.h                   |   2 +-
 src/llvm-ptls.cpp                     |   2 +
 src/llvm-version.h                    |   4 +
 src/runtime_intrinsics.c              |   4 +-
 src/signal-handling.c                 |  19 +-
 src/signals-unix.c                    |   8 +
 src/stackwalk.c                       |  39 ++++
 src/support/platform.h                |   3 +
 src/task.c                            |   8 +
 src/threading.c                       |   4 +-
 26 files changed, 599 insertions(+), 18 deletions(-)
 create mode 100644 cli/trampolines/trampolines_riscv64.S
 create mode 100644 doc/src/devdocs/build/riscv.md
 create mode 100644 src/abi_riscv.cpp

diff --git a/Make.inc b/Make.inc
index 53aee8a269732..cb79e3ca1b5a9 100644
--- a/Make.inc
+++ b/Make.inc
@@ -938,8 +938,12 @@ endif
 
 #If nothing is set default to native unless we are cross-compiling
 ifeq ($(MARCH)$(MCPU)$(MTUNE)$(JULIA_CPU_TARGET)$(XC_HOST),)
-ifeq ($(ARCH),aarch64) #ARM recommends only setting MCPU for AArch64
+ifeq ($(ARCH),aarch64)
+# ARM recommends only setting MCPU for AArch64
 MCPU=native
+else ifneq (,$(findstring riscv64,$(ARCH)))
+# RISC-V doesn't have a native option, so refuse to guess and abort the build
+$(error Building for RISC-V requires a specific MARCH to be set)
 else
 MARCH=native
 MTUNE=native
@@ -995,6 +999,9 @@ endif
 ifneq (,$(findstring arm,$(ARCH)))
 DIST_ARCH:=arm
 endif
+ifneq (,$(findstring riscv64,$(ARCH)))
+DIST_ARCH:=riscv64
+endif
 
 JULIA_BINARYDIST_FILENAME := julia-$(JULIA_COMMIT)-$(DIST_OS)$(DIST_ARCH)
 endif
@@ -1018,8 +1025,12 @@ ifneq ($(MARCH),)
 CC += -march=$(MARCH)
 CXX += -march=$(MARCH)
 FC += -march=$(MARCH)
+# On RISC-V, don't forward the MARCH ISA string to JULIA_CPU_TARGET,
+# as it's always incompatible with LLVM's CPU target name parser.
+ifeq (,$(findstring riscv64,$(ARCH)))
 JULIA_CPU_TARGET ?= $(MARCH)
 endif
+endif
 
 # Set MCPU-specific flags
 ifneq ($(MCPU),)
diff --git a/base/binaryplatforms.jl b/base/binaryplatforms.jl
index c8a55c99a5724..a372137edeb98 100644
--- a/base/binaryplatforms.jl
+++ b/base/binaryplatforms.jl
@@ -597,7 +597,7 @@ const arch_mapping = Dict(
     "armv7l" => "arm(v7l)?", # if we just see `arm-linux-gnueabihf`, we assume it's `armv7l`
     "armv6l" => "armv6l",
     "powerpc64le" => "p(ower)?pc64le",
-    "riscv64" => "riscv64",
+    "riscv64" => "(rv64|riscv64)",
 )
 # Keep this in sync with `CPUID.ISAs_by_family`
 # These are the CPUID side of the microarchitectures targeted by GCC flags in BinaryBuilder.jl
@@ -631,6 +631,9 @@ const arch_march_isa_mapping = let
             "a64fx" => get_set("aarch64", "a64fx"),
             "apple_m1" => get_set("aarch64", "apple_m1"),
         ],
+        "riscv64" => [
+            "riscv64" => get_set("riscv64", "riscv64")
+        ],
         "powerpc64le" => [
             "power8" => get_set("powerpc64le", "power8"),
         ],
diff --git a/base/cpuid.jl b/base/cpuid.jl
index f653ba27b4bcd..0370bd33b83e5 100644
--- a/base/cpuid.jl
+++ b/base/cpuid.jl
@@ -61,6 +61,9 @@ const ISAs_by_family = Dict(
         "a64fx" => ISA(Set((JL_AArch64_v8_2a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_sha2, JL_AArch64_ccpp, JL_AArch64_complxnum, JL_AArch64_fullfp16, JL_AArch64_sve))),
         "apple_m1" => ISA(Set((JL_AArch64_v8_5a, JL_AArch64_lse, JL_AArch64_crc, JL_AArch64_rdm, JL_AArch64_aes, JL_AArch64_sha2, JL_AArch64_sha3, JL_AArch64_ccpp, JL_AArch64_complxnum, JL_AArch64_fp16fml, JL_AArch64_fullfp16, JL_AArch64_dotprod, JL_AArch64_rcpc, JL_AArch64_altnzcv))),
     ],
+    "riscv64" => [
+        "riscv64" => ISA(Set{UInt32}()),
+    ],
     "powerpc64le" => [
         # We have no way to test powerpc64le features yet, so we're only going to declare the lowest ISA:
         "power8" => ISA(Set{UInt32}()),
diff --git a/cli/trampolines/trampolines_riscv64.S b/cli/trampolines/trampolines_riscv64.S
new file mode 100644
index 0000000000000..26307b7c2bb36
--- /dev/null
+++ b/cli/trampolines/trampolines_riscv64.S
@@ -0,0 +1,20 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+#include "common.h"
+#include "../../src/jl_exported_funcs.inc"
+
+#define SEP ;
+
+#define XX(name) \
+.global CNAME(name) SEP \
+.cfi_startproc SEP \
+.p2align    2 SEP \
+ CNAME(name)##: SEP \
+    auipc t3, %pcrel_hi(CNAMEADDR(name)) SEP \
+    ld t3, %pcrel_lo(CNAME(name))(t3) SEP \
+    jr t3 SEP \
+.cfi_endproc SEP \
+
+JL_RUNTIME_EXPORTED_FUNCS(XX)
+JL_CODEGEN_EXPORTED_FUNCS(XX)
+#undef XX
diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl
index 60f7290c7a0ac..04d13011d6223 100644
--- a/contrib/generate_precompile.jl
+++ b/contrib/generate_precompile.jl
@@ -202,12 +202,15 @@ if Artifacts !== nothing
     using Artifacts, Base.BinaryPlatforms, Libdl
     artifacts_toml = abspath(joinpath(Sys.STDLIB, "Artifacts", "test", "Artifacts.toml"))
     artifact_hash("HelloWorldC", artifacts_toml)
-    oldpwd = pwd(); cd(dirname(artifacts_toml))
-    macroexpand(Main, :(@artifact_str("HelloWorldC")))
-    cd(oldpwd)
     artifacts = Artifacts.load_artifacts_toml(artifacts_toml)
     platforms = [Artifacts.unpack_platform(e, "HelloWorldC", artifacts_toml) for e in artifacts["HelloWorldC"]]
     best_platform = select_platform(Dict(p => triplet(p) for p in platforms))
+    if best_platform !== nothing
+      # @artifact errors for unsupported platforms
+      oldpwd = pwd(); cd(dirname(artifacts_toml))
+      macroexpand(Main, :(@artifact_str("HelloWorldC")))
+      cd(oldpwd)
+    end
     dlopen("libjulia$(Base.isdebugbuild() ? "-debug" : "")", RTLD_LAZY | RTLD_DEEPBIND)
     """
 end
diff --git a/contrib/normalize_triplet.py b/contrib/normalize_triplet.py
index b1bab29487b8f..833b725480996 100755
--- a/contrib/normalize_triplet.py
+++ b/contrib/normalize_triplet.py
@@ -14,6 +14,7 @@
     'i686': "i\\d86",
     'aarch64': "(arm|aarch)64",
     'armv7l': "arm(v7l)?",
+    'riscv64': "(rv64|riscv64)",
     'powerpc64le': "p(ower)?pc64le",
 }
 platform_mapping = {
diff --git a/doc/src/devdocs/build/build.md b/doc/src/devdocs/build/build.md
index 0ef9ce4e4f071..553f7c2e815cf 100644
--- a/doc/src/devdocs/build/build.md
+++ b/doc/src/devdocs/build/build.md
@@ -148,6 +148,7 @@ Notes for various operating systems:
 Notes for various architectures:
 
 * [ARM](https://github.com/JuliaLang/julia/blob/master/doc/src/devdocs/build/arm.md)
+* [RISC-V](https://github.com/JuliaLang/julia/blob/master/doc/src/devdocs/build/riscv.md)
 
 ## Required Build Tools and External Libraries
 
diff --git a/doc/src/devdocs/build/riscv.md b/doc/src/devdocs/build/riscv.md
new file mode 100644
index 0000000000000..7c0e7ab29d9f8
--- /dev/null
+++ b/doc/src/devdocs/build/riscv.md
@@ -0,0 +1,103 @@
+# RISC-V (Linux)
+
+Julia has experimental support for 64-bit RISC-V (RV64) processors running
+Linux. This file provides general guidelines for compilation, in addition to
+instructions for specific devices.
+
+A list of [known issues](https://github.com/JuliaLang/julia/labels/system:riscv)
+for RISC-V is available. If you encounter difficulties, please create an issue
+including the output from `cat /proc/cpuinfo`.
+
+
+## Compiling Julia
+
+For now, Julia will need to be compiled entirely from source, i.e., including
+all of its dependencies. This can be accomplished with the following
+`Make.user`:
+
+```make
+USE_BINARYBUILDER := 0
+```
+
+Additionally, you must indicate which architecture, and optionally which CPU,
+to build for. This can be done by setting the `MARCH` and `MCPU` variables
+in `Make.user`.
+
+The `MARCH` variable needs to be set to a RISC-V ISA string, which can be found by
+looking at the documentation of your device, or by inspecting `/proc/cpuinfo`. Only
+use flags that your compiler supports, e.g., run `gcc -march=help` to see a list of
+supported flags. A common value is `rv64gc`, which is a good starting point.
+
+The `MCPU` variable is optional, and can be used to further optimize the
+generated code for a specific CPU. If you are unsure, it is recommended to leave
+it unset. You can find a list of supported values by running `gcc --target-help`.
+
+For example, if you are using a StarFive VisionFive2, which contains a JH7110
+processor based on the SiFive U74, you can set these flags as follows:
+
+```make
+MARCH := rv64gc_zba_zbb
+MCPU := sifive-u74
+```
+
+If you prefer a portable build, you could use:
+
+```make
+MARCH := rv64gc
+
+# also set JULIA_CPU_TARGET to the expanded form of rv64gc
+# (it normally copies the value of MCPU, which we don't set)
+JULIA_CPU_TARGET := generic-rv64,i,m,a,f,d,zicsr,zifencei,c
+```
+
+### Cross-compilation
+
+A native build on a RISC-V device may take a very long time, so it's also
+possible to cross-compile Julia on a faster machine.
+
+First, obtain a RISC-V cross-compilation toolchain that provides
+support for C, C++ and Fortran. This can be done by checking out the
+[riscv-gnu-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain)
+repository and building it as follows:
+
+```sh
+sudo mkdir /opt/riscv && sudo chown $USER /opt/riscv
+./configure --prefix=/opt/riscv --with-languages=c,c++,fortran
+make linux -j$(nproc)
+```
+
+Then, install the QEMU user-mode emulator for RISC-V, along with `binfmt`
+support to enable execution of RISC-V binaries on the host machine. The
+exact steps depend on your distribution, e.g., on Arch Linux it involves
+installing the `qemu-user-static` and `qemu-user-static-binfmt` packages.
+Note that to actually execute RISC-V binaries, QEMU will need to be able to
+find the RISC-V system root, which can be achieved by setting the
+`QEMU_LD_PREFIX` environment variable to the path of the root filesystem.
+
+Finally, compile Julia with the following `Make.user` variables (in addition to
+the ones from the previous section):
+
+```make
+XC_HOST=riscv64-unknown-linux-gnu
+OS=Linux
+export QEMU_LD_PREFIX=/opt/riscv/sysroot
+```
+
+Note that you will have to execute `make` with `PATH` set to include the
+cross-compilation toolchain, e.g., by running:
+
+```sh
+PATH=/opt/riscv/bin:$PATH make -j$(nproc)
+```
+
+Because the RISC-V sysroot we use is very barren, you may need to
+add additional libraries that the Julia build system currently expects
+to be available system-wide. For example, the build currently relies on
+a system-provided `libz`, so you may need to copy this library from the
+Julia build into the system root:
+
+```sh
+make -C deps install-zlib
+cp -v usr/lib/libz.*   /opt/riscv/sysroot/usr/lib
+cp -v usr/include/z*.h /opt/riscv/sysroot/usr/include
+```
diff --git a/src/abi_riscv.cpp b/src/abi_riscv.cpp
new file mode 100644
index 0000000000000..cbd85892801c8
--- /dev/null
+++ b/src/abi_riscv.cpp
@@ -0,0 +1,315 @@
+// This file is a part of Julia. License is MIT: https://julialang.org/license
+
+//===----------------------------------------------------------------------===//
+//
+// The ABI implementation used for RISC-V targets.
+//
+//===----------------------------------------------------------------------===//
+//
+// The Procedure Call Standard can be found here:
+// https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc
+//
+// This code is based on:
+// - The Rust implementation:
+//    https://github.com/rust-lang/rust/blob/master/compiler/rustc_target/src/abi/call/riscv.rs
+// - The LLVM RISC-V backend:
+//   https://github.com/llvm/llvm-project/blob/78533528cf5ed04ac78722afff7c9f2f91aa8359/llvm/lib/Target/RISCV/RISCVISelLowering.cpp#L10865
+//
+//===----------------------------------------------------------------------===//
+
+
+struct ABI_RiscvLayout : AbiLayout {
+
+static const size_t XLen = 8;
+static const size_t FLen = 8;
+static const int NumArgGPRs = 8;
+static const int NumArgFPRs = 8;
+
+// The number of available registers is needed to decide whether an fp pair or int-fp pair in a struct should be unpacked.
+// WARN: because of this, use_sret must only be called once before the next
+// needPassByRef call, otherwise avail_gprs is wrong
+int avail_gprs, avail_fprs;
+
+// the preferred type is determined at the same time as use_sret & needPassByRef run;
+// cache it here to avoid computing it again in preferred_llvm_type
+Type *cached_llvmtype = NULL;
+
+ABI_RiscvLayout() : avail_gprs(NumArgGPRs), avail_fprs(NumArgFPRs) {}
+
+enum RegPassKind { UNKNOWN = 0, INTEGER = 1, FLOAT = 2 };
+
+struct ElementType {
+    RegPassKind type;
+    jl_datatype_t *dt;
+    ElementType() : type(RegPassKind::UNKNOWN), dt(NULL) {};
+};
+
+bool is_floattype(jl_datatype_t *dt) const
+{
+    return dt == jl_float16_type || dt == jl_float32_type || dt == jl_float64_type;
+}
+
+Type *get_llvm_fptype(jl_datatype_t *dt, LLVMContext &ctx) const
+{
+    assert(is_floattype(dt));
+    switch (jl_datatype_size(dt)) {
+    case 2: return Type::getHalfTy(ctx);
+    case 4: return Type::getFloatTy(ctx);
+    case 8: return Type::getDoubleTy(ctx);
+    case 16: return Type::getFP128Ty(ctx);
+    default: assert(0 && "abi_riscv: unsupported floating point type"); return NULL;
+    }
+}
+
+// for primitive types that can be passed as integer
+// includes integer, bittypes, pointer
+Type *get_llvm_inttype(jl_datatype_t *dt, LLVMContext &ctx) const
+{
+    assert(jl_is_primitivetype(dt));
+    // XXX: without Zfh, Float16 is passed in integer registers
+    if (dt == jl_float16_type)
+        return Type::getInt32Ty(ctx);
+    assert(!is_floattype(dt));
+    if (dt == jl_bool_type)
+        return getInt8Ty(ctx);
+    if (dt == jl_int32_type)
+        return getInt32Ty(ctx);
+    if (dt == jl_int64_type)
+        return getInt64Ty(ctx);
+    int nb = jl_datatype_size(dt);
+    return Type::getIntNTy(ctx, nb * 8);
+}
+
+// Decide whether `dt` may use the hardware floating-point calling convention.
+// Scalars found while flattening `dt` are recorded in `ele1`/`ele2`; returns false for a
+// scalar wider than FLen, two integers, more than two scalars, or a non-datatype field.
+bool should_use_fp_conv(jl_datatype_t *dt, ElementType &ele1, ElementType &ele2) const
+{
+    if (jl_is_primitivetype(dt)) {
+        size_t dsz = jl_datatype_size(dt);
+        if (dsz > FLen)
+            return false;
+        if (is_floattype(dt)) {
+            if (ele1.type == RegPassKind::UNKNOWN) {
+                ele1.type = RegPassKind::FLOAT;
+                ele1.dt = dt;
+            }
+            else if (ele2.type == RegPassKind::UNKNOWN) {
+                ele2.type = RegPassKind::FLOAT;
+                ele2.dt = dt;
+            }
+            else {
+                // 3 elements not eligible, must be a pair
+                return false;
+            }
+        }
+        // integer or pointer type or bitstypes
+        else {
+            if (ele1.type == RegPassKind::UNKNOWN) {
+                ele1.type = RegPassKind::INTEGER;
+                ele1.dt = dt;
+            }
+            else if (ele1.type == RegPassKind::INTEGER) {
+                // two integers not eligible
+                return false;
+            }
+            // ele1.type == RegPassKind::FLOAT
+            else {
+                if (ele2.type == RegPassKind::UNKNOWN) {
+                    ele2.type = RegPassKind::INTEGER;
+                    ele2.dt = dt;
+                }
+                else {
+                    // 3 elements not eligible, must be a pair
+                    return false;
+                }
+            }
+        }
+    }
+    else { // aggregates
+        while (size_t nfields = jl_datatype_nfields(dt)) {
+            size_t i, fieldsz;
+            for (i = 0; i < nfields; i++) {
+                if ((fieldsz = jl_field_size(dt, i))) {
+                    break;
+                }
+            }
+            assert(i < nfields);
+            // If there's only one non zero sized member, try again on this member
+            if (fieldsz == jl_datatype_size(dt)) {
+                dt = (jl_datatype_t *)jl_field_type(dt, i);
+                if (!jl_is_datatype(dt)) // could be inline union #46787
+                    return false;
+                continue;
+            }
+            for (; i < nfields; i++) {
+                size_t fieldsz = jl_field_size(dt, i);
+                if (fieldsz == 0)
+                    continue;
+                jl_datatype_t *fieldtype = (jl_datatype_t *)jl_field_type(dt, i);
+                if (!jl_is_datatype(fieldtype)) // the field (not `dt`) could be an inline union
+                    return false;
+                // This needs to be done after the zero size member check
+                if (ele2.type != RegPassKind::UNKNOWN) {
+                    // we already have a pair and can't accept more elements
+                    return false;
+                }
+                if (!should_use_fp_conv(fieldtype, ele1, ele2))
+                    return false;
+            }
+            break;
+        }
+    }
+    // Tuple{Int,} can reach here as well, but doesn't really hurt
+    return true;
+}
+
+Type *get_llvm_inttype_byxlen(size_t xlen, LLVMContext &ctx) const
+{
+    if (xlen == 8) {
+        return getInt64Ty(ctx);
+    }
+    else if (xlen == 4) {
+        return getInt32Ty(ctx);
+    }
+    else {
+        assert(0 && "abi_riscv: unsupported xlen");
+        return NULL;
+    }
+}
+
+Type *classify_arg(jl_datatype_t *ty, int &avail_gprs, int &avail_fprs, bool &onstack,
+                   LLVMContext &ctx) const
+{
+    onstack = false;
+    if (ty == jl_nothing_type) {
+        return NULL;
+    }
+    ElementType ele1, ele2;
+    if (should_use_fp_conv(ty, ele1, ele2)) {
+        if (ele1.type == RegPassKind::FLOAT) {
+            if (ele2.type == RegPassKind::FLOAT) {
+                if (avail_fprs >= 2) {
+                    avail_fprs -= 2;
+                    SmallVector<Type *, 2> eles;
+                    eles.push_back(get_llvm_fptype(ele1.dt, ctx));
+                    eles.push_back(get_llvm_fptype(ele2.dt, ctx));
+                    return StructType::get(ctx, eles);
+                }
+            }
+            else if (ele2.type == RegPassKind::INTEGER) {
+                if (avail_fprs >= 1 && avail_gprs >= 1) {
+                    avail_fprs -= 1;
+                    avail_gprs -= 1;
+                    SmallVector<Type *, 2> eles;
+                    eles.push_back(get_llvm_fptype(ele1.dt, ctx));
+                    eles.push_back(get_llvm_inttype(ele2.dt, ctx));
+                    return StructType::get(ctx, eles);
+                }
+            }
+            else {
+                // A struct containing just one floating-point real is passed
+                // as though it were a standalone floating-point real.
+                if (avail_fprs >= 1) {
+                    avail_fprs -= 1;
+                    return get_llvm_fptype(ele1.dt, ctx);
+                }
+            }
+        }
+        else if (ele1.type == RegPassKind::INTEGER) {
+            if (ele2.type == RegPassKind::FLOAT) {
+                if (avail_fprs >= 1 && avail_gprs >= 1) {
+                    avail_fprs -= 1;
+                    avail_gprs -= 1;
+                    return StructType::get(get_llvm_inttype(ele1.dt, ctx),
+                                           get_llvm_fptype(ele2.dt, ctx));
+                }
+            }
+        }
+    }
+    size_t dsz = jl_datatype_size(ty);
+    if (dsz > 2 * XLen) {
+        if (!jl_is_primitivetype(ty)) {
+            onstack = true;
+        }
+        // else let llvm backend handle scalars
+        if (avail_gprs >= 1) {
+            avail_gprs -= 1;
+        }
+        return NULL;
+    }
+
+    if (dsz > XLen) {
+        size_t alignment = jl_datatype_align(ty);
+        bool align_regs = alignment > XLen;
+        if (avail_gprs >= 2) {
+            avail_gprs -= 2;
+        }
+        // should we handle variadic as well?
+        // Variadic arguments with 2×XLEN-bit alignment and size at most 2×XLEN
+        // bits are passed in an aligned register pair
+        else {
+            avail_gprs = 0;
+        }
+
+        if (!jl_is_primitivetype(ty)) {
+            // Aggregates or scalars passed on the stack are aligned to the
+            // greater of the type alignment and XLen bits, but never more than
+            // the stack alignment.
+            if (align_regs) {
+                if (alignment == 16) {
+                    return Type::getInt128Ty(ctx);
+                }
+                else {
+                    return Type::getInt64Ty(ctx);
+                }
+            }
+            else {
+                return ArrayType::get(get_llvm_inttype_byxlen(XLen, ctx), 2);
+            }
+        }
+        // let llvm backend handle scalars
+        return NULL;
+    }
+
+    //else dsz <= XLen
+    if (avail_gprs >= 1) {
+        avail_gprs -= 1;
+    }
+    if (!jl_is_primitivetype(ty)) {
+        return get_llvm_inttype_byxlen(XLen, ctx);
+    }
+    return get_llvm_inttype(ty, ctx);
+}
+
+bool use_sret(jl_datatype_t *ty, LLVMContext &ctx) override
+{
+    bool onstack = false;
+    int gprs = 2;
+    int fprs = FLen ? 2 : 0;
+    this->cached_llvmtype = classify_arg(ty, gprs, fprs, onstack, ctx);
+    if (onstack) {
+        this->avail_gprs -= 1;
+        return true;
+    }
+    else {
+        return false;
+    }
+}
+
+bool needPassByRef(jl_datatype_t *ty, AttrBuilder &ab, LLVMContext &ctx,
+                   Type *Ty) override
+{
+    bool onstack = false;
+    this->cached_llvmtype =
+        classify_arg(ty, this->avail_gprs, this->avail_fprs, onstack, ctx);
+    return onstack;
+}
+
+Type *preferred_llvm_type(jl_datatype_t *ty, bool isret,
+                          LLVMContext &ctx) const override
+{
+    return this->cached_llvmtype;
+}
+
+};
diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp
index c2f112f9c9d5c..279686c387e1b 100644
--- a/src/aotcompile.cpp
+++ b/src/aotcompile.cpp
@@ -1664,7 +1664,8 @@ void jl_dump_native_impl(void *native_code,
     }
 
     CodeModel::Model CMModel = CodeModel::Small;
-    if (TheTriple.isPPC() || (TheTriple.isX86() && TheTriple.isArch64Bit() && TheTriple.isOSLinux())) {
+    if (TheTriple.isPPC() || TheTriple.isRISCV() ||
+        (TheTriple.isX86() && TheTriple.isArch64Bit() && TheTriple.isOSLinux())) {
         // On PPC the small model is limited to 16bit offsets. For very large images the small code model
         CMModel = CodeModel::Medium; //  isn't good enough on x86 so use Medium, it has no cost because only the image goes in .ldata
     }
diff --git a/src/ccall.cpp b/src/ccall.cpp
index 2de5be6906e7c..f559ddbe93a43 100644
--- a/src/ccall.cpp
+++ b/src/ccall.cpp
@@ -367,6 +367,7 @@ static bool is_native_simd_type(jl_datatype_t *dt) {
 
 #include "abi_arm.cpp"
 #include "abi_aarch64.cpp"
+#include "abi_riscv.cpp"
 #include "abi_ppc64le.cpp"
 #include "abi_win32.cpp"
 #include "abi_win64.cpp"
@@ -391,6 +392,8 @@ static bool is_native_simd_type(jl_datatype_t *dt) {
   typedef ABI_ARMLayout DefaultAbiState;
 #elif defined _CPU_AARCH64_
   typedef ABI_AArch64Layout DefaultAbiState;
+#elif defined _CPU_RISCV64_
+  typedef ABI_RiscvLayout DefaultAbiState;
 #elif defined _CPU_PPC64_
   typedef ABI_PPC64leLayout DefaultAbiState;
 #else
diff --git a/src/codegen.cpp b/src/codegen.cpp
index bcda527416676..ca38ae8ddb288 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -5368,7 +5368,7 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos
     }
     CallInst *call = ctx.builder.CreateCall(cft, TheCallee, argvals);
     call->setAttributes(returninfo.attrs);
-    if (gcstack_arg)
+    if (gcstack_arg && !ctx.emission_context.TargetTriple.isRISCV())
         call->setCallingConv(CallingConv::Swift);
 
     jl_cgval_t retval;
@@ -8186,7 +8186,8 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value
 
     if (gcstack_arg){
         AttrBuilder param(ctx.builder.getContext());
-        param.addAttribute(Attribute::SwiftSelf);
+        if (!ctx.emission_context.TargetTriple.isRISCV())
+            param.addAttribute(Attribute::SwiftSelf);
         param.addAttribute(Attribute::NonNull);
         attrs.push_back(AttributeSet::get(ctx.builder.getContext(), param));
         fsig.push_back(PointerType::get(JuliaType::get_ppjlvalue_ty(ctx.builder.getContext()), 0));
@@ -8278,7 +8279,7 @@ static jl_returninfo_t get_specsig_function(jl_codectx_t &ctx, Module *M, Value
             fval = emit_inttoptr(ctx, fval, ftype->getPointerTo());
     }
     if (auto F = dyn_cast<Function>(fval)) {
-        if (gcstack_arg)
+        if (gcstack_arg && !ctx.emission_context.TargetTriple.isRISCV())
             F->setCallingConv(CallingConv::Swift);
         assert(F->arg_size() >= argnames.size());
         for (size_t i = 0; i < argnames.size(); i++) {
diff --git a/src/disasm.cpp b/src/disasm.cpp
index ebe8f2ac397c0..b944e48430c29 100644
--- a/src/disasm.cpp
+++ b/src/disasm.cpp
@@ -1058,6 +1058,8 @@ static void jl_dump_asm_internal(
                 if (insSize == 0) // skip illegible bytes
 #if defined(_CPU_PPC_) || defined(_CPU_PPC64_) || defined(_CPU_ARM_) || defined(_CPU_AARCH64_)
                     insSize = 4; // instructions are always 4 bytes
+#elif defined(_CPU_RISCV64_)
+                    insSize = 2; // instructions can be 2 bytes when compressed
 #else
                     insSize = 1; // attempt to slide 1 byte forward
 #endif
diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp
index 4ff7400df13dd..313449dda5557 100644
--- a/src/jitlayers.cpp
+++ b/src/jitlayers.cpp
@@ -998,6 +998,16 @@ namespace {
 #if defined(MSAN_EMUTLS_WORKAROUND)
         options.EmulatedTLS = true;
         options.ExplicitEmulatedTLS = true;
+#endif
+#if defined(_CPU_RISCV64_)
+        // we set these manually to avoid LLVM defaulting to soft-float
+#if defined(__riscv_float_abi_double)
+        options.MCOptions.ABIName = "lp64d";
+#elif defined(__riscv_float_abi_single)
+        options.MCOptions.ABIName = "lp64f";
+#else
+        options.MCOptions.ABIName = "lp64";
+#endif
 #endif
         uint32_t target_flags = 0;
         auto target = jl_get_llvm_target(imaging_default(), target_flags);
@@ -1042,11 +1052,23 @@ namespace {
 #endif
         if (TheTriple.isAArch64())
             codemodel = CodeModel::Small;
+        else if (TheTriple.isRISCV()) {
+            // RISC-V will support large code model in LLVM 21
+            // https://github.com/llvm/llvm-project/pull/70308
+            codemodel = CodeModel::Medium;
+        }
+        // Generate simpler code for JIT
+        Reloc::Model relocmodel = Reloc::Static;
+        if (TheTriple.isRISCV()) {
+            // until large code model is supported, use PIC for RISC-V
+            // https://github.com/llvm/llvm-project/issues/106203
+            relocmodel = Reloc::PIC_;
+        }
         auto optlevel = CodeGenOptLevelFor(jl_options.opt_level);
         auto TM = TheTarget->createTargetMachine(
                 TheTriple.getTriple(), TheCPU, FeaturesStr,
                 options,
-                Reloc::Static, // Generate simpler code for JIT
+                relocmodel,
                 codemodel,
                 optlevel,
                 true // JIT
@@ -1067,7 +1089,7 @@ namespace {
             .setCPU(TM.getTargetCPU().str())
             .setFeatures(TM.getTargetFeatureString())
             .setOptions(TM.Options)
-            .setRelocationModel(Reloc::Static)
+            .setRelocationModel(TM.getRelocationModel())
             .setCodeModel(TM.getCodeModel())
             .setCodeGenOptLevel(CodeGenOptLevelFor(optlevel));
     }
diff --git a/src/jitlayers.h b/src/jitlayers.h
index 3353a4093bd27..47ab369f1e24a 100644
--- a/src/jitlayers.h
+++ b/src/jitlayers.h
@@ -58,6 +58,10 @@
 # define JL_USE_JITLINK
 #endif
 
+#if defined(_CPU_RISCV64_)
+# define JL_USE_JITLINK
+#endif
+
 # include <llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h>
 # include <llvm/ExecutionEngine/RTDyldMemoryManager.h>
 # include <llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h>
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 20d90fede3d5e..c09bfc5c3eb42 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -105,8 +105,8 @@ JL_DLLIMPORT void __tsan_switch_to_fiber(void *fiber, unsigned flags);
 #ifndef _OS_WINDOWS_
     #if defined(_CPU_ARM_) || defined(_CPU_PPC_) || defined(_CPU_WASM_)
         #define MAX_ALIGN 8
-    #elif defined(_CPU_AARCH64_) || (JL_LLVM_VERSION >= 180000 && (defined(_CPU_X86_64_) || defined(_CPU_X86_)))
-    // int128 is 16 bytes aligned on aarch64 and on x86 with LLVM >= 18
+    #elif defined(_CPU_AARCH64_) || defined(_CPU_RISCV64_) || (JL_LLVM_VERSION >= 180000 && (defined(_CPU_X86_64_) || defined(_CPU_X86_)))
+    // int128 is 16 bytes aligned on aarch64 and riscv, and on x86 with LLVM >= 18
         #define MAX_ALIGN 16
     #elif defined(_P64)
     // Generically we assume MAX_ALIGN is sizeof(void*)
@@ -259,6 +259,11 @@ static inline uint64_t cycleclock(void) JL_NOTSAFEPOINT
     struct timeval tv;
     gettimeofday(&tv, NULL);
     return (int64_t)(tv.tv_sec) * 1000000 + tv.tv_usec;
+#elif defined(_CPU_RISCV64_)
+    // taken from https://github.com/google/benchmark/blob/3b3de69400164013199ea448f051d94d7fc7d81f/src/cycleclock.h#L190
+    uint64_t ret;
+    __asm__ volatile("rdcycle %0" : "=r"(ret));
+    return ret;
 #elif defined(_CPU_PPC64_)
     // This returns a time-base, which is not always precisely a cycle-count.
     // https://reviews.llvm.org/D78084
diff --git a/src/julia_threads.h b/src/julia_threads.h
index b697a0bf030ed..17e8d7d466044 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -56,7 +56,7 @@ typedef struct {
     !defined(JL_HAVE_ASM) && \
     !defined(JL_HAVE_UNW_CONTEXT)
 #if (defined(_CPU_X86_64_) || defined(_CPU_X86_) || defined(_CPU_AARCH64_) ||  \
-     defined(_CPU_ARM_) || defined(_CPU_PPC64_))
+     defined(_CPU_ARM_) || defined(_CPU_PPC64_) || defined(_CPU_RISCV64_))
 #define JL_HAVE_ASM
 #endif
 #if 0
diff --git a/src/llvm-ptls.cpp b/src/llvm-ptls.cpp
index 488dd46cade21..614ed15f840e6 100644
--- a/src/llvm-ptls.cpp
+++ b/src/llvm-ptls.cpp
@@ -117,6 +117,8 @@ Instruction *LowerPTLS::emit_pgcstack_tp(Value *offset, Instruction *insertBefor
             asm_str = "mrs $0, tpidr_el0";
         } else if (TargetTriple.isARM()) {
             asm_str = "mrc p15, 0, $0, c13, c0, 3";
+        } else if (TargetTriple.isRISCV()) {
+            asm_str = "mv $0, tp";
         } else if (TargetTriple.getArch() == Triple::x86_64) {
             asm_str = "movq %fs:0, $0";
         } else if (TargetTriple.getArch() == Triple::x86) {
diff --git a/src/llvm-version.h b/src/llvm-version.h
index 2a38bb7c488b8..984e918d480cc 100644
--- a/src/llvm-version.h
+++ b/src/llvm-version.h
@@ -18,6 +18,10 @@
     #define JL_LLVM_OPAQUE_POINTERS 1
 #endif
 
+#if JL_LLVM_VERSION < 190000 && defined(_CPU_RISCV64_)
+    #error Only LLVM versions >= 19.0.0 are supported by Julia on RISC-V
+#endif
+
 #ifdef __cplusplus
 #if defined(__GNUC__) && (__GNUC__ >= 9)
 // Added in GCC 9, this warning is annoying
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index db4007d32035e..450096eef5b01 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -256,7 +256,7 @@ JL_DLLEXPORT float julia_half_to_float(uint16_t param) {
 #if ((defined(__GNUC__) && __GNUC__ > 11) || \
      (defined(__clang__) && __clang_major__ > 14)) && \
     !defined(_CPU_PPC64_) && !defined(_CPU_PPC_) && \
-    !defined(_OS_WINDOWS_)
+    !defined(_OS_WINDOWS_) && !defined(_CPU_RISCV64_)
     #define FLOAT16_TYPE _Float16
     #define FLOAT16_TO_UINT16(x) (*(uint16_t*)&(x))
     #define FLOAT16_FROM_UINT16(x) (*(_Float16*)&(x))
@@ -355,7 +355,7 @@ float julia_bfloat_to_float(uint16_t param) {
 #if ((defined(__GNUC__) && __GNUC__ > 12) || \
      (defined(__clang__) && __clang_major__ > 16)) && \
     !defined(_CPU_PPC64_) && !defined(_CPU_PPC_) && \
-    !defined(_OS_WINDOWS_)
+    !defined(_OS_WINDOWS_) && !defined(_CPU_RISCV64_)
     #define BFLOAT16_TYPE __bf16
     #define BFLOAT16_TO_UINT16(x) (*(uint16_t*)&(x))
     #define BFLOAT16_FROM_UINT16(x) (*(__bf16*)&(x))
diff --git a/src/signal-handling.c b/src/signal-handling.c
index d7f4697a3c4f0..ce7e8ba57af19 100644
--- a/src/signal-handling.c
+++ b/src/signal-handling.c
@@ -256,7 +256,8 @@ static uintptr_t jl_get_pc_from_ctx(const void *_ctx);
 void jl_show_sigill(void *_ctx);
 #if defined(_CPU_X86_64_) || defined(_CPU_X86_) \
     || (defined(_OS_LINUX_) && defined(_CPU_AARCH64_)) \
-    || (defined(_OS_LINUX_) && defined(_CPU_ARM_))
+    || (defined(_OS_LINUX_) && defined(_CPU_ARM_)) \
+    || (defined(_OS_LINUX_) && defined(_CPU_RISCV64_))
 static size_t jl_safe_read_mem(const volatile char *ptr, char *out, size_t len)
 {
     jl_jmp_buf *old_buf = jl_get_safe_restore();
@@ -344,6 +345,8 @@ static uintptr_t jl_get_pc_from_ctx(const void *_ctx)
     return ((ucontext_t*)_ctx)->uc_mcontext.mc_gpregs.gp_elr;
 #elif defined(_OS_LINUX_) && defined(_CPU_ARM_)
     return ((ucontext_t*)_ctx)->uc_mcontext.arm_pc;
+#elif defined(_OS_LINUX_) && defined(_CPU_RISCV64_)
+    return ((ucontext_t*)_ctx)->uc_mcontext.__gregs[REG_PC];
 #else
     // TODO for PPC
     return 0;
@@ -421,6 +424,20 @@ void jl_show_sigill(void *_ctx)
             jl_safe_printf("Invalid ARM instruction at %p: 0x%08" PRIx32 "\n", (void*)pc, inst);
         }
     }
+#elif defined(_OS_LINUX_) && defined(_CPU_RISCV64_)
+    uint32_t inst = 0; // zero-initialized in case the read below comes up short
+    size_t len = jl_safe_read_mem(pc, (char*)&inst, 4);
+    if (len < 2)
+        jl_safe_printf("Fault when reading instruction: %d bytes read\n", (int)len);
+    if (inst == 0x00100073 || // ebreak
+        inst == 0xc0001073 || // unimp (pseudo-instruction for illegal `csrrw x0, cycle, x0`)
+        (len >= 2 && (inst & 0xffff) == 0x0000)) { // c.unimp (compressed form); only trust it if 2 bytes were read
+        // The signal might actually be SIGTRAP instead, doesn't hurt to handle it here though.
+        jl_safe_printf("Unreachable reached at %p\n", (void*)pc);
+    }
+    else {
+        jl_safe_printf("Invalid instruction at %p: 0x%08" PRIx32 "\n", (void*)pc, inst);
+    }
 #else
     // TODO for PPC
     (void)_ctx;
diff --git a/src/signals-unix.c b/src/signals-unix.c
index f99eca31730b6..caf0e977929c5 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -80,6 +80,9 @@ static inline uintptr_t jl_get_rsp_from_ctx(const void *_ctx)
 #elif defined(_OS_LINUX_) && defined(_CPU_ARM_)
     const ucontext_t *ctx = (const ucontext_t*)_ctx;
     return ctx->uc_mcontext.arm_sp;
+#elif defined(_OS_LINUX_) && (defined(_CPU_RISCV64_))
+    const ucontext_t *ctx = (const ucontext_t*)_ctx;
+    return ctx->uc_mcontext.__gregs[REG_SP];
 #elif defined(_OS_FREEBSD_) && defined(_CPU_X86_64_)
     const ucontext_t *ctx = (const ucontext_t*)_ctx;
     return ctx->uc_mcontext.mc_rsp;
@@ -175,6 +178,11 @@ JL_NO_ASAN static void jl_call_in_ctx(jl_ptls_t ptls, void (*fptr)(void), int si
     ctx->uc_mcontext.arm_sp = rsp;
     ctx->uc_mcontext.arm_lr = 0; // Clear link register
     ctx->uc_mcontext.arm_pc = target;
+#elif defined(_OS_LINUX_) && (defined(_CPU_RISCV64_))
+    ucontext_t *ctx = (ucontext_t*)_ctx;
+    ctx->uc_mcontext.__gregs[REG_SP] = rsp;
+    ctx->uc_mcontext.__gregs[REG_RA] = 0; // Clear return address (ra)
+    ctx->uc_mcontext.__gregs[REG_PC] = (uintptr_t)fptr;
 #else
 #pragma message("julia: throw-in-context not supported on this platform")
     // TODO Add support for PowerPC(64)?
diff --git a/src/stackwalk.c b/src/stackwalk.c
index 5377d091cb780..460bfafb97d2b 100644
--- a/src/stackwalk.c
+++ b/src/stackwalk.c
@@ -1066,6 +1066,45 @@ int jl_simulate_longjmp(jl_jmp_buf mctx, bt_context_t *c) JL_NOTSAFEPOINT
     mc->regs[0] = 1;
     assert(mc->sp % 16 == 0);
     return 1;
+    #elif defined(_CPU_RISCV64_)
+    // https://github.com/bminor/glibc/blob/master/sysdeps/riscv/bits/setjmp.h
+    // https://github.com/llvm/llvm-project/blob/7714e0317520207572168388f22012dd9e152e9e/libunwind/src/Registers.hpp -> Registers_riscv
+    // https://github.com/llvm/llvm-project/blob/90149204bd08c07eb672cd5b19d782fed3d96ddc/libunwind/include/libunwind.h
+    mc->__gregs[1] = (*_ctx)->__pc;        // ra
+    mc->__gregs[8] = (*_ctx)->__regs[0];   // s0
+    mc->__gregs[9] = (*_ctx)->__regs[1];   // s1
+    mc->__gregs[18] = (*_ctx)->__regs[2];  // s2
+    mc->__gregs[19] = (*_ctx)->__regs[3];  // s3
+    mc->__gregs[20] = (*_ctx)->__regs[4];  // s4
+    mc->__gregs[21] = (*_ctx)->__regs[5];  // s5
+    mc->__gregs[22] = (*_ctx)->__regs[6];  // s6
+    mc->__gregs[23] = (*_ctx)->__regs[7];  // s7
+    mc->__gregs[24] = (*_ctx)->__regs[8];  // s8
+    mc->__gregs[25] = (*_ctx)->__regs[9];  // s9
+    mc->__gregs[26] = (*_ctx)->__regs[10]; // s10
+    mc->__gregs[27] = (*_ctx)->__regs[11]; // s11
+    mc->__gregs[2] = (*_ctx)->__sp;        // sp
+    #ifndef __riscv_float_abi_soft // copy raw bits into __d.__f[N], N = FP register number (fs0-1 = f8-9, fs2-11 = f18-27)
+    *(double*)&mc->__fpregs.__d.__f[8] = (*_ctx)->__fpregs[0];   // fs0
+    *(double*)&mc->__fpregs.__d.__f[9] = (*_ctx)->__fpregs[1];   // fs1
+    *(double*)&mc->__fpregs.__d.__f[18] = (*_ctx)->__fpregs[2];  // fs2
+    *(double*)&mc->__fpregs.__d.__f[19] = (*_ctx)->__fpregs[3];  // fs3
+    *(double*)&mc->__fpregs.__d.__f[20] = (*_ctx)->__fpregs[4];  // fs4
+    *(double*)&mc->__fpregs.__d.__f[21] = (*_ctx)->__fpregs[5];  // fs5
+    *(double*)&mc->__fpregs.__d.__f[22] = (*_ctx)->__fpregs[6];  // fs6
+    *(double*)&mc->__fpregs.__d.__f[23] = (*_ctx)->__fpregs[7];  // fs7
+    *(double*)&mc->__fpregs.__d.__f[24] = (*_ctx)->__fpregs[8];  // fs8
+    *(double*)&mc->__fpregs.__d.__f[25] = (*_ctx)->__fpregs[9];  // fs9
+    *(double*)&mc->__fpregs.__d.__f[26] = (*_ctx)->__fpregs[10]; // fs10
+    *(double*)&mc->__fpregs.__d.__f[27] = (*_ctx)->__fpregs[11]; // fs11
+    #endif
+    // ifdef PTR_DEMANGLE ?
+    mc->__gregs[REG_SP] = ptr_demangle(mc->__gregs[REG_SP]);
+    mc->__gregs[REG_RA] = ptr_demangle(mc->__gregs[REG_RA]);
+    mc->__gregs[REG_PC] = mc->__gregs[REG_RA];
+    mc->__gregs[REG_A0] = 1;
+    assert(mc->__gregs[REG_SP] % 16 == 0);
+    return 1;
     #else
     #pragma message("jl_record_backtrace not defined for ASM/SETJMP on unknown linux")
     (void)mc;
diff --git a/src/support/platform.h b/src/support/platform.h
index a0dd84c9c20b6..816e2090b5a08 100644
--- a/src/support/platform.h
+++ b/src/support/platform.h
@@ -27,6 +27,7 @@
  *          _CPU_X86_64_
  *          _CPU_AARCH64_
  *          _CPU_ARM_
+ *          _CPU_RISCV64_
  *          _CPU_WASM_
  */
 
@@ -106,6 +107,8 @@
 #define _CPU_AARCH64_
 #elif defined(__arm__) || defined(_M_ARM)
 #define _CPU_ARM_
+#elif defined(__riscv) && __riscv_xlen == 64
+#define _CPU_RISCV64_
 #elif defined(__PPC64__)
 #define _CPU_PPC64_
 #elif defined(_ARCH_PPC)
diff --git a/src/task.c b/src/task.c
index f86e0ab3a880d..be2631347e82e 100644
--- a/src/task.c
+++ b/src/task.c
@@ -1491,6 +1491,14 @@ CFI_NORETURN
                     // because all our addresses are word-aligned.
         " udf #0" // abort
         : : "r" (stk), "r"(fn) : "memory" );
+#elif defined(_CPU_RISCV64_)
+    asm volatile(
+        " mv sp, %0;\n"
+        " mv ra, zero;\n" // Clear return address register
+        " mv fp, zero;\n" // Clear frame pointer
+        " jr %1;\n" // call `fn` with fake stack frame
+        " ebreak" // abort
+        : : "r"(stk), "r"(fn) : "memory" );
 #elif defined(_CPU_PPC64_)
     // N.B.: There is two iterations of the PPC64 ABI.
     // v2 is current and used here. Make sure you have the
diff --git a/src/threading.c b/src/threading.c
index c26028d2f3da2..50944a24eb29b 100644
--- a/src/threading.c
+++ b/src/threading.c
@@ -18,7 +18,7 @@
 // For variant 1 JL_ELF_TLS_INIT_SIZE is the size of the thread control block (TCB)
 // For variant 2 JL_ELF_TLS_INIT_SIZE is 0
 #if defined(_OS_LINUX_) || defined(_OS_FREEBSD_)
-#  if defined(_CPU_X86_64_) || defined(_CPU_X86_)
+#  if defined(_CPU_X86_64_) || defined(_CPU_X86_) || defined(_CPU_RISCV64_)
 #    define JL_ELF_TLS_VARIANT 2
 #    define JL_ELF_TLS_INIT_SIZE 0
 #  elif defined(_CPU_AARCH64_)
@@ -638,6 +638,8 @@ static void jl_check_tls(void)
     asm("mrs %0, tpidr_el0" : "=r"(tp));
 #elif defined(__ARM_ARCH) && __ARM_ARCH >= 7
     asm("mrc p15, 0, %0, c13, c0, 3" : "=r"(tp));
+#elif defined(_CPU_RISCV64_)
+    asm("mv %0, tp" : "=r"(tp));
 #else
 #  error "Cannot emit thread pointer for this architecture."
 #endif