From 3dcdc2b345db9bd889ad4faa89f70272118cce7f Mon Sep 17 00:00:00 2001 From: Rex Xu Date: Fri, 17 Nov 2023 17:06:33 +0800 Subject: [PATCH] Fix performance regression of normalize PR https://github.com/GPUOpen-Drivers/llpc/pull/2778 tried to resolve the signed-zero issue of normalize, but it unconditionally adds v_cmp and v_cndmask instructions, which causes a performance drop. Therefore, we add a check of the NSZ flag. When NSZ is specified, we still follow the previous handling. --- lgc/builder/ArithBuilder.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/lgc/builder/ArithBuilder.cpp b/lgc/builder/ArithBuilder.cpp index 69dc6e41d6..efa9e1d5f9 100644 --- a/lgc/builder/ArithBuilder.cpp +++ b/lgc/builder/ArithBuilder.cpp @@ -833,13 +833,23 @@ Value *BuilderImpl::CreateNormalizeVector(Value *x, const Twine &instName) { Value *dot = CreateDotProduct(x, x); Value *sqrt = CreateSqrt(dot); Value *rsq = CreateFDiv(ConstantFP::get(sqrt->getType(), 1.0), sqrt); + Value *result = nullptr; if (x->getType()->getScalarType()->isFloatTy()) { // Make sure a FP32 zero vector is normalized to a FP32 zero vector, rather than NaNs. - auto zero = ConstantFP::get(getFloatTy(), 0.0); - auto isZeroDot = CreateFCmpOEQ(dot, zero); - rsq = CreateSelect(isZeroDot, zero, rsq); + if (!getFastMathFlags().noSignedZeros()) { + // When NSZ is not specified, we avoid using fmul_legacy since the sign of the input is dropped. 
+ auto zero = ConstantFP::get(getFloatTy(), 0.0); + auto isZeroDot = CreateFCmpOEQ(dot, zero); + rsq = CreateSelect(isZeroDot, zero, rsq); + result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); }); + } else { + result = scalarize(x, [this, rsq](Value *x) -> Value * { + return CreateIntrinsic(Intrinsic::amdgcn_fmul_legacy, {}, {x, rsq}); + }); + } + } else { + result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); }); } - Value *result = scalarize(x, [this, rsq](Value *x) -> Value * { return CreateFMul(x, rsq); }); result->setName(instName); return result; }