Skip to content

Commit

Permalink
fix: LDP/STP forwarding #90
Browse files Browse the repository at this point in the history
*Description*
LDP and STP instructions load or store to 2 memory locations. It is
normally implemented as two uOps in hardware, but it is a single uOp in
flexus. This causes some unintended bugs where store-load forwarding
breaks etc.

*Steps to Reproduce*
Run data caching image with 2 coresAfter some time, there will be a
sequence of instructions as follows:str to address x + 8ldp to address x

*Expected Behavior*
Load pair should get a part of its value from str because the second
load value of the pair address matches the str address.

*Actual Behavior*
It does not forward because the addresses x and x+8 are not the same.
  • Loading branch information
BugraEryilmaz authored and branylagaffe committed Dec 17, 2024
1 parent 5f42f1d commit 0fa2b1c
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 41 deletions.
3 changes: 2 additions & 1 deletion components/Decoder/Instruction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ ArchInstruction::doDispatchEffects()
auto bp_state = bpState();

DBG_Assert(bp_state, (<< "No branch predictor state exists, but it must"));
if (isMicroOp()) return;
if (bp_state->thePredictedType == kNonBranch) return;
if (isBranch()) return;

Expand Down Expand Up @@ -161,7 +162,7 @@ decode(Flexus::SharedTypes::FetchedOpcode const& aFetchedOpcode, uint32_t aCPU,
DBG_(VVerb, (<< "\033[1;31m DECODER: Decoding " << std::hex << aFetchedOpcode.theOpcode << std::dec << "\033[0m"));

bool last_uop = true;
boost::intrusive_ptr<AbstractInstruction> ret_val = disas_a64_insn(aFetchedOpcode, aCPU, aSequenceNo, aUop);
boost::intrusive_ptr<AbstractInstruction> ret_val = disas_a64_insn(aFetchedOpcode, aCPU, aSequenceNo, aUop, last_uop);
return std::make_pair(ret_val, last_uop);
}

Expand Down
4 changes: 2 additions & 2 deletions components/Decoder/encodings/Encodings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ namespace nDecoder {

/* C3.1 A64 instruction index by encoding */
archinst
disas_a64_insn(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop)
disas_a64_insn(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop, bool &aLastUop)
{
if (aFetchedOpcode.theOpcode == 1) { // instruction fetch page fault
return blackBox(aFetchedOpcode, aCPU, aSequenceNo);
Expand All @@ -27,7 +27,7 @@ disas_a64_insn(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceN
case 0x4:
case 0x6:
case 0xc:
case 0xe: /* Loads and stores */ return disas_ldst(aFetchedOpcode, aCPU, aSequenceNo);
case 0xe: /* Loads and stores */ return disas_ldst(aFetchedOpcode, aCPU, aSequenceNo, aUop, aLastUop);
case 0x5:
case 0xd: /* Data processing - register */ return disas_data_proc_reg(aFetchedOpcode, aCPU, aSequenceNo);
case 0x7:
Expand Down
6 changes: 3 additions & 3 deletions components/Decoder/encodings/Encodings.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ disas_ldst_reg_unsigned_imm(archcode const& aFetchedOpcode, uint32_t aCPU, int64
archinst
disas_ldst_reg(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo);
archinst
disas_ldst_pair(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo);
disas_ldst_pair(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop, bool& aLastUop);
archinst
disas_ld_lit(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo);
archinst
disas_ldst_excl(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo);
archinst
disas_ldst(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo);
disas_ldst(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop, bool& aLastUop);

//<<--Data Processing -- Register
archinst
Expand Down Expand Up @@ -120,7 +120,7 @@ disas_data_proc_simd_fp(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t a

/* C3.1 A64 instruction index by encoding */
archinst
disas_a64_insn(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop);
disas_a64_insn(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop, bool& aLastUop);

} // namespace nDecoder

Expand Down
14 changes: 8 additions & 6 deletions components/Decoder/encodings/Encodings_LoadStore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,10 +300,12 @@ disas_ldst_reg(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceN
* imm7 = signed offset (multiple of 4 or 8 depending on size)
*/
archinst
disas_ldst_pair(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
disas_ldst_pair(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop, bool& aLastUop)
{
DECODER_TRACE;

aLastUop = !(aUop == 0);

bool is_vector = extract32(aFetchedOpcode.theOpcode, 26, 1);
bool is_load = extract32(aFetchedOpcode.theOpcode, 22, 1);

Expand All @@ -320,9 +322,9 @@ disas_ldst_pair(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequence
// }
} else {
if (is_load) {
return LDP(aFetchedOpcode, aCPU, aSequenceNo);
return LDP(aFetchedOpcode, aCPU, aSequenceNo, aUop);
} else {
return STP(aFetchedOpcode, aCPU, aSequenceNo);
return STP(aFetchedOpcode, aCPU, aSequenceNo, aUop);
}
}
}
Expand Down Expand Up @@ -403,7 +405,7 @@ disas_ldst_excl(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequence

/* Loads and stores */
archinst
disas_ldst(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
disas_ldst(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop, bool& aLastUop)
{
switch (extract32(aFetchedOpcode.theOpcode, 24, 6)) {
case 0x08: /* Load/store exclusive */ return disas_ldst_excl(aFetchedOpcode, aCPU, aSequenceNo);
Expand All @@ -412,7 +414,7 @@ disas_ldst(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
case 0x28:
case 0x29:
case 0x2c:
case 0x2d: /* Load/store pair (all forms) */ return disas_ldst_pair(aFetchedOpcode, aCPU, aSequenceNo);
case 0x2d: /* Load/store pair (all forms) */ return disas_ldst_pair(aFetchedOpcode, aCPU, aSequenceNo, aUop, aLastUop);
case 0x38:
case 0x39:
case 0x3c:
Expand All @@ -425,4 +427,4 @@ disas_ldst(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
}
}

} // namespace nDecoder
} // namespace nDecoder
90 changes: 65 additions & 25 deletions components/Decoder/encodings/LoadStore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ LDR_lit(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
}
// Load/store pair (all forms)
archinst
LDP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
LDP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop)
{
DECODER_TRACE;
uint32_t opc = extract32(aFetchedOpcode.theOpcode, 30, 2);
Expand All @@ -453,7 +453,9 @@ LDP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
*/
bool is_signed = (opc & 0x1) == 0x1;
uint32_t size = 8 << (scale + 1);
eSize sz = dbSize(size);
auto sizeOfEachPair = size / 2;
auto sizeOfEachPairInBytes = sizeOfEachPair / 8;
eSize sz = dbSize(size / 2);

if ((((opc & 1) == 1) && L == 0) || opc == 3 || (is_signed && index == kNoOffset)) {
/* Msutherl: if signed, only valid encodings are preindex, postindex, imm-index */
Expand All @@ -471,21 +473,34 @@ LDP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
aFetchedOpcode.theBPState,
aCPU,
aSequenceNo));
if (aUop == 0) {
inst->setIsMicroOp(true);
}

inst->setClass(clsLoad, codeLDP);

eAccType acctype = kAccType_NORMAL;
std::vector<std::list<InternalDependance>> addr_deps(1);

// calculate the address from rn
simple_action act = addAddressCompute(inst, addr_deps);
simple_action act;
if (aUop == 0) {
// first uop computes the address using rn and offset
act = addAddressCompute(inst, addr_deps);
} else {
// second uop also adds the size of each pair to the address
addr_deps.resize(2);
act = addAddressCompute(inst, addr_deps);
addReadConstant(inst, 2, sizeOfEachPairInBytes, addr_deps[1]);
}
addReadXRegister(inst, 1, rn, addr_deps[0], true);

if (index == kPreIndex || index == kPostIndex) {
if ((index == kPreIndex || index == kPostIndex) && aUop == 1) {
// C6.2.129 LDP PostIndex and PreIndex cause writeback
// Writeback only happens in the second uop
std::vector<std::list<InternalDependance>> wb_deps(1);
predicated_action wback = addExecute(inst, operation(kADD_), { kAddress, kOperand4 }, wb_deps, kResult2);
addReadConstant(inst, 4, ((index == kPostIndex) ? imm7 : 0), wb_deps[0]);
addReadConstant(inst, 4, ((index == kPostIndex) ? imm7 - sizeOfEachPairInBytes : -sizeOfEachPairInBytes), wb_deps[0]);
// wback.action->addDependance(wback.action->dependance(1));
connectDependance(wback.action->dependance(1), act);
// wb_deps[0].push_back(act.action->dependance(0));
Expand All @@ -508,7 +523,21 @@ LDP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
}

predicated_dependant_action load;
load = ldpAction(inst, sz, is_signed ? kSignExtend : kNoExtension, kPD, kPD1);
if (aUop == 0) {
if (rt != 31) {
load = loadAction(inst, sz, is_signed ? kSignExtend : kNoExtension, kPD);
addDestination(inst, rt, load, size / 2 == 64);
} else {
load = loadAction(inst, sz, is_signed ? kSignExtend : kNoExtension, boost::none);
}
} else {
if (rt2 != 31) {
load = loadAction(inst, sz, is_signed ? kSignExtend : kNoExtension, kPD);
addDestination(inst, rt2, load, size / 2 == 64);
} else {
load = loadAction(inst, sz, is_signed ? kSignExtend : kNoExtension, boost::none);
}
}

inst->addDispatchEffect(allocateLoad(inst, sz, load.dependance, acctype));
inst->addCheckTrapEffect(mmuPageFaultCheck(inst));
Expand All @@ -517,12 +546,10 @@ LDP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
inst->addSquashEffect(eraseLSQ(inst));
inst->addRetirementConstraint(loadMemoryConstraint(inst));

addPairDestination(inst, rt, rt2, load, size / 2 == 64);

return inst;
}
archinst
STP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
STP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop)
{
DECODER_TRACE;
uint32_t opc = extract32(aFetchedOpcode.theOpcode, 30, 2);
Expand All @@ -535,7 +562,9 @@ STP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
uint32_t scale = 2 + extract32(opc, 1, 1);
// bool is_signed = ( opc & 1 ) != 0;
int size = 8 << (scale + 1);
eSize sz = dbSize(size);
auto sizeOfEachPair = size / 2;
auto sizeOfEachPairInBytes = sizeOfEachPair / 8;
eSize sz = dbSize(size / 2);

if ((((opc & 1) == 1) && L == 0) || opc == 3) { return unallocated_encoding(aFetchedOpcode, aCPU, aSequenceNo); }

Expand All @@ -548,20 +577,33 @@ STP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
aSequenceNo));

inst->setClass(clsStore, codeStore);
if (aUop == 0) {
inst->setIsMicroOp(true);
}

// calculate the address from rn
eAccType acctype = kAccType_STREAM;
std::vector<std::list<InternalDependance>> addr_deps(1), data_deps(2);
simple_action addr = addAddressCompute(inst, addr_deps);
std::vector<std::list<InternalDependance>> addr_deps(1), data_deps(1);

simple_action addr;
if (aUop == 0) {
// first uop computes the address using rn and offset
addr = addAddressCompute(inst, addr_deps);
} else {
// second uop also adds the size of each pair to the address
addr_deps.resize(2);
addr = addAddressCompute(inst, addr_deps);
addReadConstant(inst, 2, sizeOfEachPairInBytes, addr_deps[1]);
}
addReadXRegister(inst, 1, rn, addr_deps[0], true);

if (index == kPreIndex || index == kPostIndex) {
if ((index == kPreIndex || index == kPostIndex) && aUop == 1) {
// C6.2.273 STP PostIndex and PreIndex cause writeback
// Writeback only happens in the second uop
std::vector<std::list<InternalDependance>> wb_deps(2);
predicated_action wback = addExecute(inst, operation(kADD_), { kAddress, kOperand4 }, wb_deps, kResult1);
connect(wb_deps[1], addr);
addReadConstant(inst, 4, ((index == kPostIndex) ? imm7 : 0), wb_deps[0]);
addReadConstant(inst, 4, ((index == kPostIndex) ? imm7 - sizeOfEachPairInBytes : - sizeOfEachPairInBytes), wb_deps[0]);
addDestination1(inst, rn, wback, size / 2 == 64);
}

Expand All @@ -575,28 +617,26 @@ STP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo)
DBG_(VVerb, (<< "setting signed offset #" << imm7));
}
}

predicated_dependant_action update_value = updateStoreValueAction(inst, kOperand5);
data_deps[0].push_back(update_value.dependance);
connectDependance(inst->retirementDependance(), update_value);
// read data registers
simple_action act =
addExecute(inst, operation(size / 2 == 64 ? kCONCAT64_ : kCONCAT32_), { kOperand2, kOperand3 }, data_deps);
readRegister(inst, 2, rt2, data_deps[0], size / 2 == 64);
readRegister(inst, 3, rt, data_deps[1], size / 2 == 64);
if (aUop == 0) {
readRegister(inst, 5, rt, data_deps[0], size / 2 == 64);
} else {
readRegister(inst, 5, rt2, data_deps[0], size / 2 == 64);
}

inst->addDispatchEffect(allocateStore(inst, sz, false, acctype));
inst->addCheckTrapEffect(mmuPageFaultCheck(inst));
inst->addRetirementConstraint(storeQueueAvailableConstraint(inst));
inst->addRetirementConstraint(sideEffectStoreConstraint(inst));

multiply_dependant_action update_value = updateSTPValueAction(inst, kResult);
inst->addDispatchEffect(satisfy(inst, update_value.dependances[1]));
connectDependance(update_value.dependances[0], act);
connectDependance(inst->retirementDependance(), update_value);

inst->addRetirementEffect(retireMem(inst));
inst->addCommitEffect(commitStore(inst));
inst->addSquashEffect(eraseLSQ(inst));

inst->addPostvalidation(validateMemory(kAddress, kResult, sz, inst));
inst->addPostvalidation(validateMemory(kAddress, kOperand5, sz, inst));

return inst;
}
Expand Down
4 changes: 2 additions & 2 deletions components/Decoder/encodings/LoadStore.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ LDR_lit(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo);

// Load/store pair (all forms)
archinst
LDP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo);
LDP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop);
archinst
STP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo);
STP(archcode const& aFetchedOpcode, uint32_t aCPU, int64_t aSequenceNo, int32_t aUop);

/* Load/store register (all forms) */
archinst
Expand Down
6 changes: 4 additions & 2 deletions components/uArch/CoreModel/cycle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1446,9 +1446,11 @@ CoreImpl::commit(boost::intrusive_ptr<Instruction> anInstruction)
throw ResynchronizeWithQemuException(true);
}

validation_passed &= checkValidatation();
if (anInstruction->advancesSimics()) {
validation_passed &= checkValidatation();
validation_passed &= anInstruction->postValidate();
}

validation_passed &= anInstruction->postValidate();
DBG_(Iface, (<< "Post Validating... " << validation_passed));

if (!validation_passed) {
Expand Down

0 comments on commit 0fa2b1c

Please sign in to comment.