JIT: Merge consecutive stores (dotnet#92852)
Co-authored-by: Egor <[email protected]>
Co-authored-by: Jakob Botsch Nielsen <[email protected]>
3 people authored Oct 3, 2023
1 parent 7abea9e commit 6b25c3b
Showing 2 changed files with 281 additions and 0 deletions.
280 changes: 280 additions & 0 deletions src/coreclr/jit/lower.cpp
@@ -7802,6 +7802,285 @@ void Lowering::ContainCheckBitCast(GenTree* node)
}
}

struct StoreCoalescingData
{
var_types targetType;
GenTree* baseAddr;
GenTree* index;
GenTree* value;
uint32_t scale;
int offset;
};

//------------------------------------------------------------------------
// GetStoreCoalescingData: given a STOREIND node, extract the data needed to
// perform store coalescing: target type, base address, index, scale, offset and value.
//
// Arguments:
// comp - the compiler instance
// ind - the STOREIND node
// data - [OUT] the data needed for store coalescing
//
// Return Value:
// true if the data was successfully retrieved, false otherwise.
// Basically, false means that we definitely can't do store coalescing.
//
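// Notes:
//    Illustrative examples of the recognized address shapes:
//      STOREIND(LCL_VAR V00)            -> baseAddr = V00, index = null, scale = 1, offset = 0
//      STOREIND(LEA(V00 + V01 * 4 + 8)) -> baseAddr = V00, index = V01, scale = 4, offset = 8
//    Address-exposed locals are rejected in both shapes.
//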
static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCoalescingData* data)
{
// Don't merge volatile stores.
if (ind->IsVolatile())
{
return false;
}

// Data has to be INT_CNS; it may also allow VEC_CNS in the future.
if (!ind->Data()->IsCnsIntOrI())
{
return false;
}

data->targetType = ind->TypeGet();
data->value = ind->Data();
if (ind->Addr()->OperIs(GT_LEA))
{
GenTree* base = ind->Addr()->AsAddrMode()->Base();
GenTree* index = ind->Addr()->AsAddrMode()->Index();
if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
{
// Base must be a local. It's possible for it to be nullptr when index is not null,
// but let's ignore such cases.
return false;
}

if ((index != nullptr) &&
(!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
{
// Index should be either nullptr or a local.
return false;
}

data->baseAddr = base;
data->index = index;
data->scale = ind->Addr()->AsAddrMode()->GetScale();
data->offset = ind->Addr()->AsAddrMode()->Offset();
}
else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
{
// Address is just a local, no offset, scale is 1
data->baseAddr = ind->Addr();
data->index = nullptr;
data->scale = 1;
data->offset = 0;
}
else
{
// Address is neither an LEA nor a local.
return false;
}
return true;
}

//------------------------------------------------------------------------
// LowerStoreIndirCoalescing: If the given STOREIND node is followed by a similar
// STOREIND node, try to merge them into a single store of a twice wider type. Example:
//
// * STOREIND int
// +--* LCL_VAR byref V00
// \--* CNS_INT int 0x1
//
// * STOREIND int
// +--* LEA(b+4) byref
// | \--* LCL_VAR byref V00
// \--* CNS_INT int 0x2
//
// We can merge these two into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value
//
// * STOREIND long
// +--* LEA(b+0) byref
// | \--* LCL_VAR byref V00
// \--* CNS_INT long 0x200000001
//
// Arguments:
// ind - the current STOREIND node
//
void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
{
// LA, RISC-V and ARM32 are more likely to receive a terrible performance hit from
// unaligned accesses, making this optimization questionable.
#if defined(TARGET_XARCH) || defined(TARGET_ARM64)
if (!comp->opts.OptimizationEnabled())
{
return;
}

// For now, we require the current STOREIND to have an LEA address (the previous store may not have one)
// so we can easily adjust the offset; consider making this more flexible in the future.
if (!ind->Addr()->OperIs(GT_LEA))
{
return;
}

// We're going to do it in a loop while we see suitable STOREINDs to coalesce.
// E.g.: we have the following LIR sequence:
//
// ...addr nodes...
// STOREIND(int)
// ...addr nodes...
// STOREIND(short)
// ...addr nodes...
// STOREIND(short) <-- we're here
//
// First we merge two 'short' stores, then we merge the result with the 'int' store
// to get a single store of 8 bytes.
do
{
// This check is not really needed, just for better throughput.
if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
{
return;
}

StoreCoalescingData currData;
StoreCoalescingData prevData;

// Get coalescing data for the current STOREIND
if (!GetStoreCoalescingData(comp, ind, &currData))
{
return;
}

bool isClosedRange = false;
// Now we need to find the very first LIR node representing the current STOREIND
// and make sure that there are no other unexpected nodes in-between.
LIR::ReadOnlyRange currIndRange = BlockRange().GetTreeRange(ind, &isClosedRange);
if (!isClosedRange)
{
return;
}
GenTree* prevTree = currIndRange.FirstNode()->gtPrev;
// Now we need to find the previous STOREIND;
// we can ignore any NOPs or IL_OFFSETs in-between.
while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET))
{
prevTree = prevTree->gtPrev;
}

// It's not a STOREIND - bail out.
if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND))
{
return;
}

// Get coalescing data for the previous STOREIND
GenTreeStoreInd* prevInd = prevTree->AsStoreInd();
if (!GetStoreCoalescingData(comp, prevInd, &prevData))
{
return;
}

// Same for the previous STOREIND, make sure there are no unexpected nodes around.
LIR::ReadOnlyRange prevIndRange = BlockRange().GetTreeRange(prevInd, &isClosedRange);
if (!isClosedRange)
{
return;
}

// STOREINDs aren't value nodes.
LIR::Use use;
assert(!BlockRange().TryGetUse(prevInd, &use) && !BlockRange().TryGetUse(ind, &use));

// BaseAddr, Index, Scale and Type all have to match.
if ((prevData.scale != currData.scale) || (prevData.targetType != currData.targetType) ||
!GenTree::Compare(prevData.baseAddr, currData.baseAddr) ||
!GenTree::Compare(prevData.index, currData.index))
{
return;
}

// The offsets have to differ by exactly the size of the type;
// we don't support identical or overlapping offsets.
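// For example, two TYP_INT stores at offsets 0 and 4 (in either order) qualify,
// while offsets 0 and 0 (the same location) or 0 and 2 (overlapping) do not.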
if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType))
{
return;
}

// Since we're merging two stores of the same type, the new type is twice wider.
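// For example, two TYP_UBYTE stores become a single TYP_USHORT store, and on
// 64-bit targets two TYP_INT stores become a single TYP_LONG store.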
var_types oldType = ind->TypeGet();
var_types newType;
switch (oldType)
{
case TYP_BYTE:
case TYP_UBYTE:
newType = TYP_USHORT;
break;

case TYP_SHORT:
case TYP_USHORT:
newType = TYP_INT; // TYP_UINT is not legal in IR
break;

#ifdef TARGET_64BIT
case TYP_INT:
newType = TYP_LONG;
break;
#endif // TARGET_64BIT

// TYP_FLOAT and TYP_DOUBLE aren't needed here - constant values of those
// types are expected to have been converted to TYP_INT/TYP_LONG already.
//
// TODO-CQ:
// 2 x LONG/REF -> SIMD16
// 2 x SIMD16 -> SIMD32
// 2 x SIMD32 -> SIMD64
//
// where it's legal (e.g. SIMD is not atomic on x64)
//
default:
return;
}

// Delete previous STOREIND entirely
BlockRange().Remove(std::move(prevIndRange));

// We know it's always LEA for now
GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();

// Update offset to be the minimum of the two
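// (e.g. stores at offsets 8 and 12 are replaced by a single store at offset 8)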
addr->SetOffset(min(prevData.offset, currData.offset));

// Update type for both STOREIND and val
ind->gtType = newType;
ind->Data()->gtType = newType;

// We currently only support these constants for val
assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());

size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();

// If the previous store was at the higher address, swap the constants.
if (prevData.offset > currData.offset)
{
std::swap(lowerCns, upperCns);
}

// Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
// the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF.
size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
lowerCns &= mask;
upperCns &= mask;

size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
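// For example, merging two TYP_USHORT stores of 0x1234 (lower address) and
// 0x5678 (higher address) yields val = 0x1234 | (0x5678 << 16) = 0x56781234.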
JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);

// It's not expected to be contained yet, but just in case...
ind->Data()->ClearContained();
ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
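// The coalesced store is only guaranteed the alignment of the original,
// narrower accesses, so conservatively mark it as unaligned.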
ind->gtFlags |= GTF_IND_UNALIGNED;

} while (true);
#endif // TARGET_XARCH || TARGET_ARM64
}

//------------------------------------------------------------------------
// LowerStoreIndirCommon: common logic for lowering a StoreIndir node.
//
@@ -7842,6 +8121,7 @@ void Lowering::LowerStoreIndirCommon(GenTreeStoreInd* ind)
}
#endif

LowerStoreIndirCoalescing(ind);
LowerStoreIndir(ind);
}
}
1 change: 1 addition & 0 deletions src/coreclr/jit/lower.h
@@ -312,6 +312,7 @@ class Lowering final : public Phase
void LowerStoreIndirCommon(GenTreeStoreInd* ind);
void LowerIndir(GenTreeIndir* ind);
void LowerStoreIndir(GenTreeStoreInd* node);
void LowerStoreIndirCoalescing(GenTreeStoreInd* node);
GenTree* LowerAdd(GenTreeOp* node);
GenTree* LowerMul(GenTreeOp* mul);
bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode);
