Skip to content

Commit

Permalink
lgc: kill unused outputs based on channelWriteMask
Browse files Browse the repository at this point in the history
This optimizes output calculations based on the channelWriteMask
from the pipeline state, in two ways:
- Remove export instructions for color targets that have a 0 write mask
- Replace components that aren't used according to the write mask with
  'poison'

(cherry picked from commit fa12be36e0bca4a1ac9c0f4a1be7c6e3e0d63d33)
  • Loading branch information
xuechen417 committed Oct 27, 2023
1 parent a2ed744 commit dc096a2
Show file tree
Hide file tree
Showing 10 changed files with 117 additions and 123 deletions.
2 changes: 1 addition & 1 deletion lgc/include/lgc/patch/FragColorExport.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class FragColorExport {
FragColorExport &operator=(const FragColorExport &) = delete;

llvm::Value *handleColorExportInstructions(llvm::Value *output, unsigned int hwColorExport, BuilderBase &builder,
ExportFormat expFmt, const bool signedness);
ExportFormat expFmt, const bool signedness, unsigned channelWriteMask);

llvm::Value *convertToHalf(llvm::Value *value, bool signedness, BuilderBase &builder) const;
llvm::Value *convertToFloat(llvm::Value *value, bool signedness, BuilderBase &builder) const;
Expand Down
1 change: 1 addition & 0 deletions lgc/interface/lgc/Pipeline.h
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ struct ColorExportFormat {
unsigned blendEnable; // Blend will be enabled for this target at draw time
unsigned blendSrcAlphaToColor; // Whether source alpha is blended to color channels for this target
// at draw time
unsigned channelWriteMask; // Write mask to specify destination channels
};

// Struct to pass to SetColorExportState
Expand Down
216 changes: 104 additions & 112 deletions lgc/patch/FragColorExport.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ LowerFragColorExport::LowerFragColorExport() : m_exportValues(MaxColorTargets +
// @param input : The value we want to extract elements from
// @param builder : The IR builder for inserting instructions
// @param [out] results : The returned elements
static void extractElements(Value *input, BuilderBase &builder, SmallVectorImpl<Value *> &results) {
static void extractElements(Value *input, BuilderBase &builder, std::array<Value *, 4> &results) {
Type *valueTy = input->getType();
unsigned compCount = valueTy->isVectorTy() ? cast<FixedVectorType>(valueTy)->getNumElements() : 1;
assert(compCount <= 4 && "At-most four elements allowed\n");
Expand All @@ -94,8 +94,10 @@ static void extractElements(Value *input, BuilderBase &builder, SmallVectorImpl<
// @param builder : The IR builder for inserting instructions
// @param expFmt: The format for the given render target
// @param signedness: If output should be interpreted as a signed integer
// @param channelWriteMask: Write mask to specify destination channels
Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hwColorExport, BuilderBase &builder,
ExportFormat expFmt, const bool signedness) {
ExportFormat expFmt, const bool signedness,
unsigned channelWriteMask) {
assert(expFmt != EXP_FORMAT_ZERO);

Type *outputTy = output->getType();
Expand All @@ -116,10 +118,18 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw
floatTy, // EXP_FORMAT_32_ABGR = 9,
};

SmallVector<Value *, 4> comps(4);
const auto undefFloat = PoisonValue::get(builder.getFloatTy());
const auto undefFloat16x2 = PoisonValue::get(FixedVectorType::get(builder.getHalfTy(), 2));

std::array<Value *, 4> comps;
std::array<Value *, 4> exports{undefFloat, undefFloat, undefFloat, undefFloat};
unsigned exportMask = 0;

Type *exportTy = exportTypeMapping[expFmt];

const bool dualSourceBlendedEnable = m_pipelineState->getColorExportState().dualSourceBlendEnable ||
m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable;

// For 32-bit output, we always scalarize, but for 16-bit output we may just operate on the vector.
if (exportTy->isFloatTy()) {
if (compCount == 1) {
Expand All @@ -130,56 +140,56 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw
}
}

const auto undefFloat = PoisonValue::get(builder.getFloatTy());
const auto undefFloat16x2 = PoisonValue::get(FixedVectorType::get(builder.getHalfTy(), 2));

switch (expFmt) {
case EXP_FORMAT_32_R: {
compCount = 1;
comps[0] = convertToFloat(comps[0], signedness, builder);
break;
}
case EXP_FORMAT_32_GR: {
if (compCount >= 2) {
case EXP_FORMAT_32_R:
case EXP_FORMAT_32_GR:
case EXP_FORMAT_32_ABGR: {
if (expFmt == EXP_FORMAT_32_GR && compCount >= 2)
compCount = 2;
comps[0] = convertToFloat(comps[0], signedness, builder);
comps[1] = convertToFloat(comps[1], signedness, builder);
} else {
else if (expFmt != EXP_FORMAT_32_ABGR)
compCount = 1;
comps[0] = convertToFloat(comps[0], signedness, builder);

for (unsigned idx = 0; idx < compCount; ++idx) {
unsigned compMask = 1 << idx;
if (compMask & channelWriteMask) {
exports[idx] = convertToFloat(comps[idx], signedness, builder);
exportMask |= compMask;
}
}
break;
}
case EXP_FORMAT_32_AR: {
if (1 & channelWriteMask) {
exports[0] = convertToFloat(comps[0], signedness, builder);
exportMask = 1;
}
if (compCount == 4) {
if (0x8 & channelWriteMask) {
exports[1] = convertToFloat(comps[3], signedness, builder);
exportMask |= 0x2;
}
compCount = 2;
comps[0] = convertToFloat(comps[0], signedness, builder);
comps[1] = convertToFloat(comps[3], signedness, builder);
} else {
compCount = 1;
comps[0] = convertToFloat(comps[0], signedness, builder);
}
break;
}
case EXP_FORMAT_32_ABGR: {
for (unsigned i = 0; i < compCount; ++i)
comps[i] = convertToFloat(comps[i], signedness, builder);

for (unsigned i = compCount; i < 4; ++i)
comps[i] = undefFloat;
break;
}
case EXP_FORMAT_FP16_ABGR: {
const unsigned compactCompCount = (compCount + 1) / 2;
exports[0] = exports[1] = undefFloat16x2;
// convert to half type
if (bitWidth <= 16) {
output = convertToHalf(output, signedness, builder);
extractElements(output, builder, comps);
// re-pack
comps[0] = builder.CreateInsertElement(undefFloat16x2, comps[0], builder.getInt32(0));
comps[0] = builder.CreateInsertElement(comps[0], comps[1], builder.getInt32(1));
if (compCount > 2) {
comps[1] = builder.CreateInsertElement(undefFloat16x2, comps[2], builder.getInt32(0));
comps[1] = builder.CreateInsertElement(comps[1], comps[3], builder.getInt32(1));
for (unsigned idx = 0; idx < compactCompCount; ++idx) {
unsigned origIdx = 2 * idx;
unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1));
if (compMask & channelWriteMask) {
exports[idx] = builder.CreateInsertElement(undefFloat16x2, comps[origIdx], builder.getInt32(0));
exports[idx] = builder.CreateInsertElement(exports[idx], comps[origIdx + 1], builder.getInt32(1));
exportMask |= compMask;
}
}
} else {
if (outputTy->isIntOrIntVectorTy())
Expand All @@ -188,47 +198,47 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw
: builder.getFloatTy());
extractElements(output, builder, comps);

Attribute::AttrKind attribs[] = {Attribute::ReadNone};
comps[0] = builder.CreateNamedCall("llvm.amdgcn.cvt.pkrtz", FixedVectorType::get(builder.getHalfTy(), 2),
{comps[0], comps[1]}, attribs);
if (compCount > 2)
comps[1] = builder.CreateNamedCall("llvm.amdgcn.cvt.pkrtz", FixedVectorType::get(builder.getHalfTy(), 2),
{comps[2], comps[3]}, attribs);
for (unsigned idx = 0; idx < compactCompCount; ++idx) {
unsigned origIdx = 2 * idx;
unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1));
if (compMask & channelWriteMask) {
exports[idx] = builder.CreateIntrinsic(FixedVectorType::get(builder.getHalfTy(), 2),
Intrinsic::amdgcn_cvt_pkrtz, {comps[origIdx], comps[origIdx + 1]});
exportMask |= compMask;
}
}
}
break;
}
case EXP_FORMAT_UNORM16_ABGR:
case EXP_FORMAT_SNORM16_ABGR: {
output = convertToFloat(output, signedness, builder);
extractElements(output, builder, comps);

StringRef funcName =
expFmt == EXP_FORMAT_SNORM16_ABGR ? "llvm.amdgcn.cvt.pknorm.i16" : "llvm.amdgcn.cvt.pknorm.u16";

for (unsigned idx = 0; idx < (compCount + 1) / 2; idx++) {
Value *packedComps = builder.CreateNamedCall(funcName, FixedVectorType::get(builder.getInt16Ty(), 2),
{comps[2 * idx], comps[2 * idx + 1]}, {});

comps[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2));
}

break;
}
case EXP_FORMAT_SNORM16_ABGR:
case EXP_FORMAT_UINT16_ABGR:
case EXP_FORMAT_SINT16_ABGR: {
assert(compCount <= 4);
output = convertToInt(output, signedness, builder);
extractElements(output, builder, comps);

StringRef funcName = expFmt == EXP_FORMAT_SINT16_ABGR ? "llvm.amdgcn.cvt.pk.i16" : "llvm.amdgcn.cvt.pk.u16";

for (unsigned idx = 0; idx < (compCount + 1) / 2; idx++) {
Value *packedComps = builder.CreateNamedCall(funcName, FixedVectorType::get(builder.getInt16Ty(), 2),
{comps[2 * idx], comps[2 * idx + 1]}, {});

comps[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2));
unsigned cvtIntrinsic;
if (expFmt == EXP_FORMAT_SNORM16_ABGR || expFmt == EXP_FORMAT_UNORM16_ABGR) {
output = convertToFloat(output, signedness, builder);
cvtIntrinsic =
expFmt == EXP_FORMAT_SNORM16_ABGR ? Intrinsic::amdgcn_cvt_pknorm_i16 : Intrinsic::amdgcn_cvt_pknorm_u16;
} else {
output = convertToInt(output, signedness, builder);
cvtIntrinsic = expFmt == EXP_FORMAT_SINT16_ABGR ? Intrinsic::amdgcn_cvt_pk_i16 : Intrinsic::amdgcn_cvt_pk_u16;
}
extractElements(output, builder, comps);

const unsigned compactCompCount = (compCount + 1) / 2;
exports[0] = exports[1] = undefFloat16x2;
for (unsigned idx = 0; idx < compactCompCount; idx++) {
unsigned origIdx = 2 * idx;
unsigned compMask = (1 << origIdx) | (1 << (origIdx + 1));
if (compMask & channelWriteMask) {
Value *packedComps = builder.CreateIntrinsic(FixedVectorType::get(builder.getInt16Ty(), 2), cvtIntrinsic,
{comps[2 * idx], comps[2 * idx + 1]});
exports[idx] = builder.CreateBitCast(packedComps, FixedVectorType::get(builder.getHalfTy(), 2));
exportMask |= compMask;
}
}
break;
}
default: {
Expand All @@ -237,67 +247,48 @@ Value *FragColorExport::handleColorExportInstructions(Value *output, unsigned hw
}
}

if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11 &&
(m_pipelineState->getColorExportState().dualSourceBlendEnable ||
m_pipelineState->getColorExportState().dynamicDualSourceBlendEnable)) {
// Save them for later dual-source-swizzle
m_blendSourceChannels = exportTy->isHalfTy() ? (compCount + 1) / 2 : compCount;
assert(hwColorExport <= 1);
m_blendSources[hwColorExport].append(comps.begin(), comps.end());
return nullptr;
if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) {
if (dualSourceBlendedEnable) {
// Save them for later dual-source-swizzle
m_blendSourceChannels = exportTy->isHalfTy() ? (compCount + 1) / 2 : compCount;
assert(hwColorExport <= 1);
m_blendSources[hwColorExport].append(exports.begin(), exports.end());
return nullptr;
} else if (exportTy->isHalfTy()) {
// GFX11 removes compressed export, simply use 32bit-data export.
exportMask = 0;
const unsigned compactCompCount = (compCount + 1) / 2;
for (unsigned idx = 0; idx < compactCompCount; ++idx) {
exports[idx] = builder.CreateBitCast(exports[idx], builder.getFloatTy());
exportMask |= 1 << idx;
}
for (unsigned idx = compactCompCount; idx < 4; ++idx)
exports[idx] = undefFloat;
}
}

Value *exportCall = nullptr;

if (exportTy->isHalfTy()) {
// GFX11 removes compressed export, simply use 32bit-data export.
if (m_pipelineState->getTargetInfo().getGfxIpVersion().major >= 11) {
// Translate compCount into the number of 32bit data.
compCount = (compCount + 1) / 2;
for (unsigned i = 0; i < compCount; i++)
comps[i] = builder.CreateBitCast(comps[i], builder.getFloatTy());
for (unsigned i = compCount; i < 4; i++)
comps[i] = undefFloat;

Value *args[] = {
builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt
builder.getInt32((1 << compCount) - 1), // en
comps[0], // src0
comps[1], // src1
comps[2], // src2
comps[3], // src3
builder.getFalse(), // done
builder.getTrue() // vm
};

return builder.CreateNamedCall("llvm.amdgcn.exp.f32", Type::getVoidTy(*m_context), args, {});
}

if (exportTy->isHalfTy() && m_pipelineState->getTargetInfo().getGfxIpVersion().major < 11) {
// 16-bit export (compressed)
if (compCount <= 2)
comps[1] = undefFloat16x2;
Value *args[] = {
builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt
builder.getInt32(compCount > 2 ? 0xF : 0x3), // en
comps[0], // src0
comps[1], // src1
builder.getInt32(exportMask), // en
exports[0], // src0
exports[1], // src1
builder.getFalse(), // done
builder.getTrue() // vm
};

exportCall = builder.CreateNamedCall("llvm.amdgcn.exp.compr.v2f16", Type::getVoidTy(*m_context), args, {});
} else {
// 32-bit export
for (unsigned i = compCount; i < 4; i++)
comps[i] = undefFloat;

Value *args[] = {
builder.getInt32(EXP_TARGET_MRT_0 + hwColorExport), // tgt
builder.getInt32((1 << compCount) - 1), // en
comps[0], // src0
comps[1], // src1
comps[2], // src2
comps[3], // src3
builder.getInt32(exportMask), // en
exports[0], // src0
exports[1], // src1
exports[2], // src2
exports[3], // src3
builder.getFalse(), // done
builder.getTrue() // vm
};
Expand Down Expand Up @@ -942,9 +933,10 @@ void FragColorExport::generateExportInstructions(ArrayRef<ColorExportInfo> info,
assert(infoIt->hwColorTarget < MaxColorTargets);

auto expFmt = static_cast<ExportFormat>(m_pipelineState->computeExportFormat(infoIt->ty, location));
if (expFmt != EXP_FORMAT_ZERO) {
const unsigned channelWriteMask = m_pipelineState->getColorExportFormat(location).channelWriteMask;
if (expFmt != EXP_FORMAT_ZERO && channelWriteMask != 0) {
lastExport = handleColorExportInstructions(values[infoIt->hwColorTarget], hwColorExport, builder, expFmt,
infoIt->isSigned);
infoIt->isSigned, channelWriteMask);
finalExportFormats.push_back(expFmt);
++hwColorExport;
}
Expand Down
6 changes: 3 additions & 3 deletions lgc/state/PalMetadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1065,9 +1065,9 @@ void PalMetadata::updateCbShaderMask(llvm::ArrayRef<ColorExportInfo> exps) {
for (auto &exp : exps) {
if (exp.hwColorTarget == MaxColorTargets)
continue;

if (m_pipelineState->computeExportFormat(exp.ty, exp.location) != 0) {
cbShaderMask |= (0xF << (4 * exp.location));
const unsigned channelWriteMask = m_pipelineState->getColorExportFormat(exp.location).channelWriteMask;
if (m_pipelineState->computeExportFormat(exp.ty, exp.location) != 0 && channelWriteMask != 0) {
cbShaderMask |= (channelWriteMask << (4 * exp.location));
}
}

Expand Down
2 changes: 1 addition & 1 deletion lgc/state/PipelineState.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1212,7 +1212,7 @@ void PipelineState::recordColorExportState(Module *module) {

// The color export formats named metadata node's operands are:
// - N metadata nodes for N color targets, each one containing
// { dfmt, nfmt, blendEnable, blendSrcAlphaToColor }
// { dfmt, nfmt, blendEnable, blendSrcAlphaToColor, channelWriteMask }
for (const ColorExportFormat &target : m_colorExportFormats)
exportFormatsMetaNode->addOperand(getArrayOfInt32MetaNode(getContext(), target, /*atLeastOneValue=*/true));
}
Expand Down
4 changes: 2 additions & 2 deletions lgc/test/ElfRelocationSize.lgc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ target triple = "amdgcn--amdpal"
!1 = !{i32 -1843601953, i32 337452067, i32 -1234379640, i32 1173800166}
!2 = !{i32 2}
!4 = !{i32 -225903757, i32 -647980161, i32 1491774676, i32 -114025882}
!18 = !{i32 10}
!18 = !{i32 10, i32 0, i32 0, i32 0, i32 15}
!19 = !{i32 0, i32 0, i32 0, i32 0, i32 13, i32 7, i32 -1}
!20 = !{i32 1, i32 0, i32 24, i32 0, i32 11, i32 7, i32 -1}
!21 = !{i32 0, i32 0, i32 0, i32 1}
Expand Down Expand Up @@ -131,7 +131,7 @@ attributes #3 = { nounwind readonly willreturn }
!10 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 11, i32 1, i32 1}
!11 = !{!"DescriptorResource", i32 1, i32 0, i32 0, i32 80, i32 0, i32 1, i32 8}
!14 = !{!"DescriptorSampler", i32 2, i32 0, i32 -1, i32 4, i32 0, i32 2, i32 4, i32 12288, i32 117436416, i32 1750073344, i32 -2147483648}
!19 = !{i32 10}
!19 = !{i32 10, i32 0, i32 0, i32 0, i32 15}
!20 = !{i32 6}

; ----------------------------------------------------------------------
Expand Down
4 changes: 2 additions & 2 deletions lgc/test/PartPipeline.lgc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ target triple = "amdgcn--amdpal"
!1 = !{i32 -1843601953, i32 337452067, i32 -1234379640, i32 1173800166}
!2 = !{i32 2}
!4 = !{i32 -225903757, i32 -647980161, i32 1491774676, i32 -114025882}
!18 = !{i32 10}
!18 = !{i32 10, i32 0, i32 0, i32 0, i32 15}
!19 = !{i32 0, i32 0, i32 0, i32 0, i32 13, i32 7, i32 -1}
!20 = !{i32 1, i32 0, i32 24, i32 0, i32 11, i32 7, i32 -1}
!21 = !{i32 0, i32 0, i32 0, i32 1}
Expand Down Expand Up @@ -140,7 +140,7 @@ attributes #3 = { nounwind readonly willreturn }
!10 = !{!"DescriptorTableVaPtr", i32 0, i32 0, i32 11, i32 1, i32 1}
!11 = !{!"DescriptorResource", i32 1, i32 0, i32 0, i32 80, i32 0, i32 1, i32 8}
!14 = !{!"DescriptorSampler", i32 2, i32 0, i32 -1, i32 4, i32 0, i32 2, i32 4, i32 12288, i32 117436416, i32 1750073344, i32 -2147483648}
!19 = !{i32 10}
!19 = !{i32 10, i32 0, i32 0, i32 0, i32 15}
!20 = !{i32 6}

; ----------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion lgc/test/TextureRange.lgc
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ attributes #3 = { nounwind readnone }
!10 = !{!"InlineBuffer", i32 14, i32 0, i32 3, i32 1, i32 -1610612736, i32 4, i32 4}
!11 = !{!"DescriptorBufferCompact", i32 10, i32 0, i32 4, i32 4, i32 -536870912, i32 0, i32 2}
!12 = !{!"IndirectUserDataVaPtr", i32 0, i32 0, i32 7, i32 1, i32 0}
!13 = !{i32 16}
!13 = !{i32 16, i32 0, i32 0, i32 0, i32 15}
!14 = !{i32 3, i32 3}
!15 = !{!"\82\B0amdpal.pipelines\91\84\AA.registers\80\B0.spill_threshold\CE\FF\FF\FF\FF\B0.user_data_limit\00\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\9B\97\D5d\9F\E5\B7\11\CF\E9#\B4W\05\EA\C6\A7\AD.llpc_version\A453.5\AEamdpal.version\92\02\03"}
!16 = !{i32 0}
Expand Down
Loading

0 comments on commit dc096a2

Please sign in to comment.