From 6b25c3ba1a349b22e70c5418b5b05e545e1e2271 Mon Sep 17 00:00:00 2001
From: Egor Bogatov
Date: Tue, 3 Oct 2023 14:08:40 +0200
Subject: [PATCH] JIT: Merge consecutive stores (#92852)

Co-authored-by: Egor
Co-authored-by: Jakob Botsch Nielsen
---
 src/coreclr/jit/lower.cpp | 280 ++++++++++++++++++++++++++++++++++++++
 src/coreclr/jit/lower.h   |   1 +
 2 files changed, 281 insertions(+)

diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 4b9d56e240c46..7bc73d6bcecf0 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -7802,6 +7802,285 @@ void Lowering::ContainCheckBitCast(GenTree* node)
     }
 }
 
+struct StoreCoalescingData
+{
+    var_types targetType;
+    GenTree*  baseAddr;
+    GenTree*  index;
+    GenTree*  value;
+    uint32_t  scale;
+    int       offset;
+};
+
+//------------------------------------------------------------------------
+// GetStoreCoalescingData: given a STOREIND node, extract the data needed to
+//    perform store coalescing: the target type, the decomposed address
+//    (base, index, scale, offset) and the value being stored.
+//
+// Arguments:
+//    comp - the compiler instance
+//    ind  - the STOREIND node
+//    data - [OUT] the data needed for store coalescing
+//
+// Return Value:
+//    true if the data was successfully retrieved, false otherwise.
+//    Basically, false means that we definitely can't do store coalescing.
+//
+static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCoalescingData* data)
+{
+    // Don't merge volatile stores.
+    if (ind->IsVolatile())
+    {
+        return false;
+    }
+
+    // Data has to be INT_CNS; it could also be VEC_CNS in the future.
+    if (!ind->Data()->IsCnsIntOrI())
+    {
+        return false;
+    }
+
+    data->targetType = ind->TypeGet();
+    data->value      = ind->Data();
+    if (ind->Addr()->OperIs(GT_LEA))
+    {
+        GenTree* base  = ind->Addr()->AsAddrMode()->Base();
+        GenTree* index = ind->Addr()->AsAddrMode()->Index();
+        if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
+        {
+            // Base must be a local. It's possible for it to be nullptr when index is not null,
+            // but let's ignore such cases.
+            return false;
+        }
+
+        if ((index != nullptr) &&
+            (!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
+        {
+            // Index should be either nullptr or a local.
+            return false;
+        }
+
+        data->baseAddr = base;
+        data->index    = index;
+        data->scale    = ind->Addr()->AsAddrMode()->GetScale();
+        data->offset   = ind->Addr()->AsAddrMode()->Offset();
+    }
+    else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
+    {
+        // The address is just a local: no offset, scale is 1.
+        data->baseAddr = ind->Addr();
+        data->index    = nullptr;
+        data->scale    = 1;
+        data->offset   = 0;
+    }
+    else
+    {
+        // The address is neither a LEA nor a local.
+        return false;
+    }
+    return true;
+}
+
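+// For illustration (hypothetical values): given a STOREIND short whose address is a
+// LEA with base V00, index V01, scale 2 and offset 6, and whose data is CNS_INT 0x2A,
+// GetStoreCoalescingData fills the output as
+//
+//     { targetType = TYP_SHORT, baseAddr = V00, index = V01, scale = 2, offset = 6, value = 0x2A }
+//
+// assuming neither V00 nor V01 is address-exposed; otherwise it returns false.
+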
+//------------------------------------------------------------------------
+// LowerStoreIndirCoalescing: If the given STOREIND node is followed by a similar
+//    STOREIND node, try to merge them into a single store of twice the width.
+//    Example:
+//
+//      *  STOREIND int
+//      +--* LCL_VAR byref V00
+//      \--* CNS_INT int 0x1
+//
+//      *  STOREIND int
+//      +--* LEA(b+4) byref
+//      |  \--* LCL_VAR byref V00
+//      \--* CNS_INT int 0x2
+//
+//    We can merge these two into a single 8-byte store with (0x1 | (0x2 << 32)) as the value:
+//
+//      *  STOREIND long
+//      +--* LEA(b+0) byref
+//      |  \--* LCL_VAR byref V00
+//      \--* CNS_INT long 0x200000001
+//
+// Arguments:
+//    ind - the current STOREIND node
+//
+void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
+{
+// LA, RISC-V and ARM32 are more likely to receive a terrible performance hit from
+// unaligned accesses, which makes this optimization questionable on those targets.
+#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
+    if (!comp->opts.OptimizationEnabled())
+    {
+        return;
+    }
+
+    // For now, we require the current STOREIND to have a LEA address (the previous store may not
+    // have one) so we can easily adjust the offset; consider making this more flexible in the future.
+    if (!ind->Addr()->OperIs(GT_LEA))
+    {
+        return;
+    }
+
+    // We're going to do this in a loop while we keep seeing suitable STOREINDs to coalesce.
+    // E.g. we might have the following LIR sequence:
+    //
+    //   ...addr nodes...
+    //   STOREIND(int)
+    //   ...addr nodes...
+    //   STOREIND(short)
+    //   ...addr nodes...
+    //   STOREIND(short) <-- we're here
+    //
+    // First we merge the two 'short' stores, then we merge the result with the 'int' store
+    // to get a single store of 8 bytes.
+    do
+    {
+        // This check is not really needed, it's just for better throughput.
+        if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
+        {
+            return;
+        }
+
+        StoreCoalescingData currData;
+        StoreCoalescingData prevData;
+
+        // Get coalescing data for the current STOREIND.
+        if (!GetStoreCoalescingData(comp, ind, &currData))
+        {
+            return;
+        }
+
+        bool isClosedRange = false;
+        // Now we need to find the very first LIR node representing the current STOREIND
+        // and make sure that there are no other unexpected nodes in-between.
+        LIR::ReadOnlyRange currIndRange = BlockRange().GetTreeRange(ind, &isClosedRange);
+        if (!isClosedRange)
+        {
+            return;
+        }
+        GenTree* prevTree = currIndRange.FirstNode()->gtPrev;
+        // Now we need to find the previous STOREIND;
+        // we can ignore any NOPs or IL_OFFSETs in-between.
+        while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET))
+        {
+            prevTree = prevTree->gtPrev;
+        }
+
+        // It's not a STOREIND - bail out.
+        if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND))
+        {
+            return;
+        }
+
+        // Get coalescing data for the previous STOREIND.
+        GenTreeStoreInd* prevInd = prevTree->AsStoreInd();
+        if (!GetStoreCoalescingData(comp, prevInd, &prevData))
+        {
+            return;
+        }
+
+        // Same for the previous STOREIND: make sure there are no unexpected nodes around it.
+        LIR::ReadOnlyRange prevIndRange = BlockRange().GetTreeRange(prevInd, &isClosedRange);
+        if (!isClosedRange)
+        {
+            return;
+        }
+
+        // STOREINDs aren't value nodes.
+        LIR::Use use;
+        assert(!BlockRange().TryGetUse(prevInd, &use) && !BlockRange().TryGetUse(ind, &use));
+
+        // BaseAddr, Index, Scale and Type all have to match.
+        if ((prevData.scale != currData.scale) || (prevData.targetType != currData.targetType) ||
+            !GenTree::Compare(prevData.baseAddr, currData.baseAddr) ||
+            !GenTree::Compare(prevData.index, currData.index))
+        {
+            return;
+        }
+
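+        // For illustration (hypothetical values): the following pair passes the checks above
+        // and the offset check below, since both stores use base V00, no index, the same scale
+        // and TYP_SHORT, and the offsets differ by exactly genTypeSize(TYP_SHORT) == 2:
+        //
+        //     STOREIND(short) [V00 + 4] = 0x11
+        //     STOREIND(short) [V00 + 6] = 0x22
+        //
+        // Together they cover [V00 + 4, V00 + 8) and are merged into STOREIND(int) [V00 + 4] = 0x220011.
+        // Identical or overlapping offsets (e.g. +4 and +5) are rejected below.
+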
+        // The offsets have to differ by exactly the size of the type. We don't support
+        // identical or overlapping offsets.
+        if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType))
+        {
+            return;
+        }
+
+        // Since we're merging two stores of the same type, the new type is twice as wide.
+        var_types oldType = ind->TypeGet();
+        var_types newType;
+        switch (oldType)
+        {
+            case TYP_BYTE:
+            case TYP_UBYTE:
+                newType = TYP_USHORT;
+                break;
+
+            case TYP_SHORT:
+            case TYP_USHORT:
+                newType = TYP_INT; // TYP_UINT is not legal in IR
+                break;
+
+#ifdef TARGET_64BIT
+            case TYP_INT:
+                newType = TYP_LONG;
+                break;
+#endif // TARGET_64BIT
+
+            // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to
+            // be converted to TYP_INT/TYP_LONG for the constant value.
+            //
+            // TODO-CQ:
+            //   2 x LONG/REF -> SIMD16
+            //   2 x SIMD16   -> SIMD32
+            //   2 x SIMD32   -> SIMD64
+            //
+            // where it's legal (e.g. SIMD is not atomic on x64)
+            //
+            default:
+                return;
+        }
+
+        // Delete the previous STOREIND entirely.
+        BlockRange().Remove(std::move(prevIndRange));
+
+        // We know the address is always a LEA for now.
+        GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();
+
+        // Update the offset to be the minimum of the two.
+        addr->SetOffset(min(prevData.offset, currData.offset));
+
+        // Update the type for both the STOREIND and its value.
+        ind->gtType         = newType;
+        ind->Data()->gtType = newType;
+
+        // We currently only support integer constants for the value.
+        assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());
+
+        size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
+        size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();
+
+        // If the previous store was at the higher address, swap the constants.
+        if (prevData.offset > currData.offset)
+        {
+            std::swap(lowerCns, upperCns);
+        }
+
+        // Trim the constants to the size of the old type, e.g. for TYP_SHORT and TYP_USHORT
+        // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF.
+        size_t mask = ~(size_t(0)) >> ((sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE);
+        lowerCns &= mask;
+        upperCns &= mask;
+
+        size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
+        JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);
+
+        // The value is not expected to be contained yet, but just in case...
+        ind->Data()->ClearContained();
+        ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
+        ind->gtFlags |= GTF_IND_UNALIGNED;
+
+    } while (true);
+#endif // TARGET_XARCH || TARGET_ARM64
+}
+
 //------------------------------------------------------------------------
 // LowerStoreIndirCommon: a common logic to lower StoreIndir.
 //
@@ -7842,6 +8121,7 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
         }
 #endif
 
+        LowerStoreIndirCoalescing(ind);
         LowerStoreIndir(ind);
     }
 }
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index 2ac01306a16b5..8ffe86c9a51b1 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -312,6 +312,7 @@ class Lowering final : public Phase
     void LowerStoreIndirCommon(GenTreeStoreInd* ind);
     void LowerIndir(GenTreeIndir* ind);
     void LowerStoreIndir(GenTreeStoreInd* node);
+    void LowerStoreIndirCoalescing(GenTreeStoreInd* node);
     GenTree* LowerAdd(GenTreeOp* node);
     GenTree* LowerMul(GenTreeOp* mul);
     bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode);
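The constant-merging arithmetic above can be illustrated with a small standalone sketch, not part of the patch. CombineStoreConstants is a hypothetical helper that stands in for the JIT code: genTypeSize(oldType) becomes an explicit sizeBytes parameter, BITS_IN_BYTE is written as 8, and a little-endian target is assumed, as on the x64/ARM64 targets the optimization is enabled for.

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>

// Combines two adjacent narrow constants into one value of twice the width.
// 'sizeBytes' is the size of the original store type (1, 2 or 4 bytes).
static uint64_t CombineStoreConstants(int prevOffset, uint64_t prevCns,
                                      int currOffset, uint64_t currCns,
                                      unsigned sizeBytes)
{
    uint64_t lowerCns = prevCns;
    uint64_t upperCns = currCns;

    // The store at the higher address supplies the upper half (little-endian).
    if (prevOffset > currOffset)
    {
        std::swap(lowerCns, upperCns);
    }

    // Trim each constant to the width of the original type, e.g. 0xFFFF for a 2-byte store.
    uint64_t mask = ~uint64_t(0) >> ((sizeof(uint64_t) - sizeBytes) * 8);
    lowerCns &= mask;
    upperCns &= mask;

    return lowerCns | (upperCns << (sizeBytes * 8));
}

int main()
{
    // Two 4-byte stores: 0x1 at offset 0 and 0x2 at offset 4 merge into 0x200000001,
    // matching the example in the comment of LowerStoreIndirCoalescing.
    uint64_t merged = CombineStoreConstants(0, 0x1, 4, 0x2, 4);
    printf("0x%llx\n", (unsigned long long)merged);

    // Sanity check on a little-endian machine: one 8-byte store of 'merged'
    // writes the same bytes as the two original 4-byte stores.
    unsigned char a[8], b[8];
    uint32_t lo = 0x1, hi = 0x2;
    memcpy(a, &lo, 4);
    memcpy(a + 4, &hi, 4);
    memcpy(b, &merged, 8);
    printf("%s\n", memcmp(a, b, 8) == 0 ? "match" : "mismatch");
    return 0;
}

Built with any C++11 compiler, this prints 0x200000001 followed by "match" on a little-endian machine.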