From b60444079e5482f756a03a88998436a337590f60 Mon Sep 17 00:00:00 2001 From: "j-h.choi" Date: Thu, 9 Nov 2023 09:23:45 +0900 Subject: [PATCH 1/2] [RISC-V] Initial patch to fix RISCV64 interpreter --- src/coreclr/vm/interpreter.cpp | 62 ++++++++++++++--- src/coreclr/vm/riscv64/cgencpu.h | 4 +- src/coreclr/vm/riscv64/stubs.cpp | 110 +++++++++++++++++++++++++++++++ src/coreclr/vm/stublink.cpp | 31 ++++++++- src/coreclr/vm/stublink.h | 12 ++++ 5 files changed, 208 insertions(+), 11 deletions(-) diff --git a/src/coreclr/vm/interpreter.cpp b/src/coreclr/vm/interpreter.cpp index 6fe37bfb5aa98..9f18d7a117c68 100644 --- a/src/coreclr/vm/interpreter.cpp +++ b/src/coreclr/vm/interpreter.cpp @@ -911,6 +911,9 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp, // x8 through x15 are scratch registers on ARM64. IntReg x8 = IntReg(8); IntReg x9 = IntReg(9); + +#elif defined(HOST_RISCV64) +#else #error unsupported platform #endif } @@ -1073,7 +1076,7 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp, argState.AddArg(vaSigCookieIndex); } -#if defined(HOST_ARM) || defined(HOST_AMD64) || defined(HOST_ARM64) +#if defined(HOST_ARM) || defined(HOST_AMD64) || defined(HOST_ARM64) || defined(HOST_RISCV64) // Generics context comes before args on ARM. Would be better if I factored this out as a call, // to avoid large swatches of duplicate code. if (hasGenericsContextArg) @@ -1081,7 +1084,7 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp, argPerm[genericsContextArgIndex] = physArgIndex; physArgIndex++; argState.AddArg(genericsContextArgIndex); } -#endif // HOST_ARM || HOST_AMD64 || HOST_ARM64 +#endif // HOST_ARM || HOST_AMD64 || HOST_ARM64 || HOST_RISCV64 CORINFO_ARG_LIST_HANDLE argPtr = info->args.args; // Some arguments are have been passed in registers, some in memory. We must generate code that @@ -1432,7 +1435,7 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp, sl.X86EmitPopReg(kEBP); sl.X86EmitReturn(static_cast(argState.callerArgStackSlots * sizeof(void*))); #elif defined(UNIX_AMD64_ABI) - bool hasTowRetSlots = info->args.retType == CORINFO_TYPE_VALUECLASS && + bool hasTwoRetSlots = info->args.retType == CORINFO_TYPE_VALUECLASS && getClassSize(info->args.retTypeClass) == 16; int fixedTwoSlotSize = 16; @@ -1484,7 +1487,7 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp, sl.X86EmitRegLoad(ARGUMENT_kREG1, reinterpret_cast(interpMethInfo)); sl.X86EmitCall(sl.NewExternalCodeLabel(interpretMethodFunc), 0); - if (hasTowRetSlots) { + if (hasTwoRetSlots) { sl.X86EmitEspOffset(0x8b, kRAX, 0); sl.X86EmitEspOffset(0x8b, kRDX, 8); } @@ -1635,7 +1638,40 @@ CorJitResult Interpreter::GenerateInterpreterStub(CEEInfo* comp, #elif defined(HOST_LOONGARCH64) assert(!"unimplemented on LOONGARCH yet"); #elif defined(HOST_RISCV64) - assert(!"unimplemented on RISCV64 yet"); + bool hasTwoRetSlots = info->args.retType == CORINFO_TYPE_VALUECLASS && + getClassSize(info->args.retTypeClass) == 16; + + UINT stackFrameSize = argState.numFPRegArgSlots; + + sl.EmitProlog(argState.numRegArgs, argState.numFPRegArgSlots, hasTwoRetSlots ? 2 * sizeof(void*) : 0); + +#if INTERP_ILSTUBS + if (pMD->IsILStub()) + { + // Third argument is stubcontext, in t2 (METHODDESC_REGISTER). + sl.EmitMovReg(IntReg(12), IntReg(7)); + } + else +#endif + { + // For a non-ILStub method, push NULL as the third StubContext argument. + sl.EmitMovConstant(IntReg(12), 0); + } + // Second arg is pointer to the base of the ILargs arr -- i.e., the current stack value. 
+    sl.EmitAddImm(IntReg(11), RegSp, sl.GetSavedRegArgsOffset());
+
+    // First arg is the pointer to the interpMethodInfo structure
+    sl.EmitMovConstant(IntReg(10), reinterpret_cast<UINT64>(interpMethInfo));
+
+    sl.EmitCallLabel(sl.NewExternalCodeLabel((LPVOID)interpretMethodFunc), FALSE, FALSE);
+    if (hasTwoRetSlots)
+    {
+        // TODO: handle return registers to use int or float registers
+        sl.EmitLoad(IntReg(10), RegSp, 0);
+        sl.EmitLoad(IntReg(11), RegSp, sizeof(void*));
+    }
+
+    sl.EmitEpilog();
 #else
 #error unsupported platform
 #endif
@@ -2430,6 +2466,14 @@ void Interpreter::ExecuteMethod(ARG_SLOT* retVal, _Out_ bool* pDoJmpCall, _Out_
             //The Fixed Two slot return buffer address
             memcpy(m_ilArgs-16, OpStackGet(0), sz);
         }
+#elif defined(TARGET_RISCV64)
+        // Is it a struct contained in two slots
+        else if (m_methInfo->m_returnType == CORINFO_TYPE_VALUECLASS
+                && sz == 16)
+        {
+            //The Fixed Two slot return buffer address
+            memcpy(m_ilArgs-32, OpStackGet(0), sz);
+        }
 #endif
         else if (CorInfoTypeIsFloatingPoint(m_methInfo->m_returnType) &&
                 CorInfoTypeIsFloatingPoint(retValIt.ToCorInfoType()))
@@ -9448,7 +9492,7 @@ void Interpreter::DoCallWork(bool virtualCall, void* thisArg, CORINFO_RESOLVED_T
                 HFAReturnArgSlots = (HFAReturnArgSlots + sizeof(ARG_SLOT) - 1) / sizeof(ARG_SLOT);
             }
         }
-#elif defined(UNIX_AMD64_ABI)
+#elif defined(UNIX_AMD64_ABI) || defined(TARGET_RISCV64)
         unsigned HasTwoSlotBuf = sigInfo.retType == CORINFO_TYPE_VALUECLASS &&
             getClassSize(sigInfo.retTypeClass) == 16;
 #endif
@@ -9689,7 +9733,7 @@ void Interpreter::DoCallWork(bool virtualCall, void* thisArg, CORINFO_RESOLVED_T
     // This is the argument slot that will be used to hold the return value.
    // In UNIX_AMD64_ABI, return type may have need tow ARG_SLOTs.
     ARG_SLOT retVals[2] = {0, 0};
-#if !defined(HOST_ARM) && !defined(UNIX_AMD64_ABI)
+#if !defined(HOST_ARM) && !defined(UNIX_AMD64_ABI) && !defined(TARGET_RISCV64)
     _ASSERTE (NUMBER_RETURNVALUE_SLOTS == 1);
 #endif
@@ -9968,7 +10012,7 @@ void Interpreter::DoCallWork(bool virtualCall, void* thisArg, CORINFO_RESOLVED_T
         bool b = CycleTimer::GetThreadCyclesS(&startCycles);
         _ASSERTE(b);
 #endif // INTERP_ILCYCLE_PROFILE
-#if defined(UNIX_AMD64_ABI)
+#if defined(UNIX_AMD64_ABI) || defined(TARGET_RISCV64)
         mdcs.CallTargetWorker(args, retVals, HasTwoSlotBuf ?
16: 8); #else mdcs.CallTargetWorker(args, retVals, 8); @@ -10114,7 +10158,7 @@ void Interpreter::DoCallWork(bool virtualCall, void* thisArg, CORINFO_RESOLVED_T { OpStackSet(m_curStackHt, GetSmallStructValue(&smallStructRetVal, retTypeSz)); } -#if defined(UNIX_AMD64_ABI) +#if defined(UNIX_AMD64_ABI) || defined(TARGET_RISCV64) else if (HasTwoSlotBuf) { void* dst = LargeStructOperandStackPush(16); diff --git a/src/coreclr/vm/riscv64/cgencpu.h b/src/coreclr/vm/riscv64/cgencpu.h index 2549ec22e60bc..13b20a588b71a 100644 --- a/src/coreclr/vm/riscv64/cgencpu.h +++ b/src/coreclr/vm/riscv64/cgencpu.h @@ -365,7 +365,6 @@ class StubLinkerCPU : public StubLinker void EmitComputedInstantiatingMethodStub(MethodDesc* pSharedMD, struct ShuffleEntry *pShuffleEntryArray, void* extraArg); #endif // FEATURE_SHARE_GENERIC_CODE -private: void EmitMovConstant(IntReg target, UINT64 constant); void EmitJumpRegister(IntReg regTarget); void EmitMovReg(IntReg dest, IntReg source); @@ -380,6 +379,9 @@ class StubLinkerCPU : public StubLinker void EmitLoad(FloatReg dest, IntReg srcAddr, int offset = 0); void EmitStore(IntReg src, IntReg destAddr, int offset = 0); void EmitStore(FloatReg src, IntReg destAddr, int offset = 0); + + void EmitProlog(unsigned short cIntRegArgs, unsigned short cFpRegArgs, unsigned short cbStackSpace = 0); + void EmitEpilog(); }; extern "C" void SinglecastDelegateInvokeStub(); diff --git a/src/coreclr/vm/riscv64/stubs.cpp b/src/coreclr/vm/riscv64/stubs.cpp index dd4c11c8a0f11..c1021474634cd 100644 --- a/src/coreclr/vm/riscv64/stubs.cpp +++ b/src/coreclr/vm/riscv64/stubs.cpp @@ -1104,6 +1104,116 @@ void StubLinkerCPU::EmitJumpRegister(IntReg regTarget) Emit32(0x00000067 | (regTarget << 15)); } +void StubLinkerCPU::EmitProlog(unsigned short cIntRegArgs, unsigned short cFpRegArgs, unsigned short cbStackSpace) +{ + _ASSERTE(!m_fProlog); + + unsigned short numberOfEntriesOnStack = 2 + cIntRegArgs + cFpRegArgs; // 2 for fp, ra + + // Stack needs to be 16 byte aligned. Compute the required padding before saving it + unsigned short totalPaddedFrameSize = static_cast(ALIGN_UP(cbStackSpace + numberOfEntriesOnStack * sizeof(void*), 2 * sizeof(void*))); + // The padding is going to be applied to the local stack + cbStackSpace = totalPaddedFrameSize - numberOfEntriesOnStack * sizeof(void*); + + // Record the parameters of this prolog so that we can generate a matching epilog and unwind info. + DescribeProlog(cIntRegArgs, cFpRegArgs, cbStackSpace); + + + // N.B Despite the range of a jump with a sub sp is 4KB, we're limiting to 504 to save from emitting right prolog that's + // expressable in unwind codes efficiently. The largest offset in typical unwindinfo encodings that we use is 504. + // so allocations larger than 504 bytes would require setting the SP in multiple strides, which would complicate both + // prolog and epilog generation as well as unwindinfo generation. 
+ _ASSERTE((totalPaddedFrameSize <= 504) && "NYI:RISCV64 Implement StubLinker prologs with larger than 504 bytes of frame size"); + if (totalPaddedFrameSize > 504) + COMPlusThrow(kNotSupportedException); + + // Here is how the stack would look like (Stack grows up) + // [Low Address] + // +------------+ + // SP -> | | <-+ + // : : | Stack Frame, (i.e outgoing arguments) including padding + // | | <-+ + // +------------+ + // | FP | + // +------------+ + // | RA | + // +------------+ + // | F10 | <-+ + // +------------+ | + // : : | Fp Args + // +------------+ | + // | F17 | <-+ + // +------------+ + // | X10 | <-+ + // +------------+ | + // : : | Int Args + // +------------+ | + // | X17 | <-+ + // +------------+ + // Old SP -> |[Stack Args]| + // [High Address] + + // Regarding the order of operations in the prolog and epilog; + // If the prolog and the epilog matches each other we can simplify emitting the unwind codes and save a few + // bytes of unwind codes by making prolog and epilog share the same unwind codes. + // In order to do that we need to make the epilog be the reverse of the prolog. + // But we wouldn't want to add restoring of the argument registers as that's completely unnecessary. + // Besides, saving argument registers cannot be expressed by the unwind code encodings. + // So, we'll push saving the argument registers to the very last in the prolog, skip restoring it in epilog, + // and also skip reporting it to the OS. + // + // Another bit that we can save is resetting the frame pointer. + // This is not necessary when the SP doesn't get modified beyond prolog and epilog. (i.e no alloca/localloc) + // And in that case we don't need to report setting up the FP either. + + // 1. Relocate SP + EmitSubImm(RegSp, RegSp, totalPaddedFrameSize); + + unsigned cbOffset = 2 * sizeof(void*) + cbStackSpace; // 2 is for fp, ra + + // 2. Store FP/RA + EmitStore(RegFp, RegSp, cbStackSpace); + EmitStore(RegRa, RegSp, cbStackSpace + sizeof(void*)); + + // 3. Set the frame pointer + EmitMovReg(RegFp, RegSp); + + // 4. Store floating point argument registers + _ASSERTE(cFpRegArgs <= 8); + for (unsigned short i = 0; i < cFpRegArgs; i++) + EmitStore(FloatReg(i + 10), RegSp, cbOffset + i * sizeof(void*)); + + // 5. Store int argument registers + cbOffset += cFpRegArgs * sizeof(void*); + _ASSERTE(cIntRegArgs <= 8); + for (unsigned short i = 0 ; i < cIntRegArgs; i++) + EmitStore(IntReg(i + 10), RegSp, cbOffset + i * sizeof(void*)); +} + +void StubLinkerCPU::EmitEpilog() +{ + _ASSERTE(m_fProlog); + + // 5. Restore int argument registers + // nop: We don't need to. They are scratch registers + + // 4. Restore floating point argument registers + // nop: We don't need to. They are scratch registers + + // 3. Restore the SP from FP + // N.B. We're assuming that the stublinker stubs doesn't do alloca, hence nop + + // 2. Restore FP/RA + EmitLoad(RegFp, RegSp, m_cbStackSpace); + EmitLoad(RegRa, RegSp, m_cbStackSpace + sizeof(void*)); + + // 1. 
Restore SP + EmitAddImm(RegSp, RegSp, GetStackFrameSize()); + + // jalr x0, 0(ra) + EmitJumpRegister(1); +} + // Instruction types as per RISC-V Spec, Chapter 24 RV32/64G Instruction Set Listings static unsigned ITypeInstr(unsigned opcode, unsigned funct3, unsigned rd, unsigned rs1, int imm12) { diff --git a/src/coreclr/vm/stublink.cpp b/src/coreclr/vm/stublink.cpp index bd66bad8cd630..7bdf203faad8f 100644 --- a/src/coreclr/vm/stublink.cpp +++ b/src/coreclr/vm/stublink.cpp @@ -360,6 +360,12 @@ StubLinker::StubLinker() m_cbStackFrame = 0; m_fPushArgRegs = FALSE; #endif +#ifdef TARGET_RISCV64 + m_fProlog = FALSE; + m_cIntRegArgs = 0; + m_cFpRegArgs = 0; + m_cbStackSpace = 0; +#endif #ifdef STUBLINKER_GENERATES_UNWIND_INFO #ifdef _DEBUG m_pUnwindInfoCheckLabel = NULL; @@ -1891,7 +1897,30 @@ UINT StubLinker::GetStackFrameSize() return m_cbStackSpace + (2 + m_cCalleeSavedRegs + m_cIntRegArgs + m_cVecRegArgs)*sizeof(void*); } -#endif // ifdef TARGET_ARM, elif defined(TARGET_ARM64) +#elif defined(TARGET_RISCV64) +void StubLinker::DescribeProlog(UINT cIntRegArgs, UINT cFpRegArgs, UINT cbStackSpace) +{ + m_fProlog = TRUE; + m_cIntRegArgs = cIntRegArgs; + m_cFpRegArgs = cFpRegArgs; + m_cbStackSpace = cbStackSpace; +} + +UINT StubLinker::GetSavedRegArgsOffset() +{ + _ASSERTE(m_fProlog); + // This is the offset from SP + // We're assuming that the stublinker will push the arg registers to the bottom of the stack frame + return m_cbStackSpace + 2 * sizeof(void*); // 2 is for FP and LR +} + +UINT StubLinker::GetStackFrameSize() +{ + _ASSERTE(m_fProlog); + return m_cbStackSpace + (2 + m_cIntRegArgs + m_cFpRegArgs) * sizeof(void*); +} + +#endif // ifdef TARGET_ARM, elif defined(TARGET_ARM64), elif defined(TARGET_RISCV64) #endif // #ifndef DACCESS_COMPILE diff --git a/src/coreclr/vm/stublink.h b/src/coreclr/vm/stublink.h index bd926001ae719..8189fac7a36af 100644 --- a/src/coreclr/vm/stublink.h +++ b/src/coreclr/vm/stublink.h @@ -229,6 +229,10 @@ class StubLinker void DescribeProlog(UINT cIntRegArgs, UINT cVecRegArgs, UINT cCalleeSavedRegs, UINT cbStackFrame); UINT GetSavedRegArgsOffset(); UINT GetStackFrameSize(); +#elif defined(TARGET_RISCV64) + void DescribeProlog(UINT cIntRegArgs, UINT cVecRegArgs, UINT cbStackFrame); + UINT GetSavedRegArgsOffset(); + UINT GetStackFrameSize(); #endif //=========================================================================== @@ -304,6 +308,14 @@ class StubLinker UINT m_cbStackSpace; // Additional stack space for return buffer and stack alignment #endif // TARGET_ARM64 +#ifdef TARGET_RISCV64 +protected: + BOOL m_fProlog; // True if DescribeProlog has been called + UINT m_cIntRegArgs; // Count of int register arguments (x10 - x17) + UINT m_cFpRegArgs; // Count of FP register arguments (f10 - f17) + UINT m_cbStackSpace; // Additional stack space for return buffer and stack alignment +#endif // TARGET_RISCV64 + #ifdef STUBLINKER_GENERATES_UNWIND_INFO #ifdef _DEBUG From 4b9df420c94694f982a5344cb24b4b8cb624922c Mon Sep 17 00:00:00 2001 From: "j-h.choi" Date: Fri, 10 Nov 2023 10:29:56 +0900 Subject: [PATCH 2/2] Code review feedback --- src/coreclr/vm/riscv64/stubs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/vm/riscv64/stubs.cpp b/src/coreclr/vm/riscv64/stubs.cpp index c1021474634cd..a308af61a8768 100644 --- a/src/coreclr/vm/riscv64/stubs.cpp +++ b/src/coreclr/vm/riscv64/stubs.cpp @@ -1211,7 +1211,7 @@ void StubLinkerCPU::EmitEpilog() EmitAddImm(RegSp, RegSp, GetStackFrameSize()); // jalr x0, 0(ra) - EmitJumpRegister(1); + 
EmitJumpRegister(RegRa); } // Instruction types as per RISC-V Spec, Chapter 24 RV32/64G Instruction Set Listings
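A note on the follow-up commit: assuming RegRa is the IntReg constant for x1 (the same register the prolog stores as the return address), EmitJumpRegister(RegRa) emits exactly the "jalr x0, 0(ra)" encoding that the literal 1 produced before, so the change is a readability fix from review, not a behavioural one. A minimal standalone check of that encoding, mirroring the Emit32(0x00000067 | (regTarget << 15)) form used by EmitJumpRegister (EncodeJalrX0 is an invented helper for illustration, not a CoreCLR API):

    #include <cassert>
    #include <cstdint>

    // jalr: opcode 0x67, rd in bits [11:7], funct3 = 0, rs1 in bits [19:15], imm in bits [31:20].
    // With rd = x0 and imm = 0 this is "jalr x0, 0(rs1)", i.e. the canonical "ret" when rs1 = ra (x1).
    static uint32_t EncodeJalrX0(unsigned rs1)
    {
        return 0x00000067u | (rs1 << 15);
    }

    int main()
    {
        const unsigned ra = 1;                     // x1 is ra in the RISC-V register numbering
        assert(EncodeJalrX0(ra) == 0x00008067u);   // canonical "ret" encoding
        return 0;
    }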
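For readers checking the offsets in EmitProlog/EmitEpilog against GetStackFrameSize() and GetSavedRegArgsOffset(): the frame is laid out, from the new SP upward, as the local space (return buffer plus alignment padding), then FP and RA, then the saved FP argument registers f10.., then the saved integer argument registers x10.., and the total is kept 16-byte aligned. The sketch below is illustrative only (FrameLayoutSketch and ComputeFrameLayout are invented names, not part of the patch) and simply restates that arithmetic so it can be sanity-checked in isolation:

    #include <cassert>
    #include <cstddef>

    struct FrameLayoutSketch
    {
        size_t cbStackSpace;        // local space (e.g. the two-slot return buffer) plus padding
        size_t fpRaOffset;          // FP stored here, RA at fpRaOffset + sizeof(void*), relative to the new SP
        size_t savedRegArgsOffset;  // first saved argument register (f10.. first, then x10..)
        size_t totalFrameSize;      // amount subtracted from SP by the prolog
    };

    static FrameLayoutSketch ComputeFrameLayout(unsigned cIntRegArgs, unsigned cFpRegArgs, size_t cbStackSpace)
    {
        const size_t slot = sizeof(void*);                  // 8 bytes on RISCV64
        size_t entries = 2 + cIntRegArgs + cFpRegArgs;      // FP, RA and the saved argument registers
        size_t total = (cbStackSpace + entries * slot + 15) & ~static_cast<size_t>(15); // 16-byte aligned

        FrameLayoutSketch l;
        l.cbStackSpace       = total - entries * slot;      // padding is folded into the local area
        l.fpRaOffset         = l.cbStackSpace;
        l.savedRegArgsOffset = l.cbStackSpace + 2 * slot;   // what GetSavedRegArgsOffset() returns
        l.totalFrameSize     = total;                       // what GetStackFrameSize() returns
        return l;
    }

    int main()
    {
        // Example: 2 integer register args, 1 FP register arg, 16 bytes for a two-slot return buffer.
        FrameLayoutSketch l = ComputeFrameLayout(2, 1, 16);
        assert(l.totalFrameSize % 16 == 0);
        assert(l.savedRegArgsOffset == l.fpRaOffset + 2 * sizeof(void*));
        return 0;
    }

With that ordering, a1 = SP + GetSavedRegArgsOffset(), the second argument handed to InterpretMethod by the stub, points at the first saved argument register, with the remaining FP and integer argument registers contiguous after it; the epilog never reloads them because they are caller-scratch argument registers.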
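The hasTwoRetSlots / HasTwoSlotBuf paths wire up one small convention for 16-byte value-class returns: the interpreter spills the result into the two pointer-sized slots reserved at the bottom of the stub frame (the memcpy(m_ilArgs-32, OpStackGet(0), sz) added above is intended to land in that buffer), and after InterpretMethod returns, the stub reloads those slots into a0/a1 via EmitLoad(IntReg(10), ...) and EmitLoad(IntReg(11), ...), since the RISC-V LP64 ABI returns a two-field integer struct in that register pair (an all-double struct would travel in fa0/fa1 instead, which is what the TODO about int or float registers refers to). The sketch below is illustrative only (Pair128, SpillResult and ReloadResult are invented names, not CoreCLR APIs) and restates that data flow in plain C++:

    #include <cstdint>
    #include <cstring>

    // A 16-byte value class made of integer fields; the RISC-V LP64 ABI returns it in a0/a1.
    struct Pair128
    {
        uint64_t lo;
        uint64_t hi;
    };

    // Stand-in for the interpreter writing its result into the two reserved slots at the
    // bottom of the stub frame (the RISCV64 path uses memcpy(m_ilArgs - 32, ...) for this).
    static void SpillResult(void* twoSlotBuf, const Pair128& value)
    {
        std::memcpy(twoSlotBuf, &value, sizeof(value));     // slot 0 = lo, slot 1 = hi
    }

    // Stand-in for what the post-call loads achieve in the stub: the two slots become the
    // a0/a1 pair seen by the native caller when the struct is returned by value.
    static Pair128 ReloadResult(const void* twoSlotBuf)
    {
        Pair128 value;
        std::memcpy(&value, twoSlotBuf, sizeof(value));
        return value;
    }

    int main()
    {
        uint64_t slots[2] = {};
        SpillResult(slots, Pair128{1, 2});
        Pair128 r = ReloadResult(slots);
        return (r.lo == 1 && r.hi == 2) ? 0 : 1;
    }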