[CodeGenPrepare] Reverse the canonicalization of isInf/isNanOrInf #81572
Conversation
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-backend-aarch64

Author: Yingwei Zheng (dtcxzyw)

Changes

In commit 2b58244, we canonicalize the isInf/isNanOrInf idiom into fabs+fcmp for better analysis/codegen (see also the discussion in #76338). This patch reverses the fabs+fcmp to is.fpclass. Fixes the regression introduced by 2b58244 and #80414 (comment). It is an alternative to #81404.

Patch is 66.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81572.diff

7 Files Affected:
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 09c4922d8822cc..33e4aedb392d75 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1943,6 +1943,30 @@ static bool swapICmpOperandsToExposeCSEOpportunities(CmpInst *Cmp) {
return false;
}
+static bool foldFCmpToFPClassTest(CmpInst *Cmp) {
+ FCmpInst *FCmp = dyn_cast<FCmpInst>(Cmp);
+ if (!FCmp)
+ return false;
+
+ // Reverse the canonicalization if it is a FP class test
+ auto ShouldReverseTransform = [](FPClassTest ClassTest) {
+ return ClassTest == fcInf || ClassTest == (fcInf | fcNan);
+ };
+ auto [ClassVal, ClassTest] =
+ fcmpToClassTest(FCmp->getPredicate(), *FCmp->getParent()->getParent(),
+ FCmp->getOperand(0), FCmp->getOperand(1));
+ if (ClassVal && (ShouldReverseTransform(ClassTest) ||
+ ShouldReverseTransform(~ClassTest))) {
+ IRBuilder<> Builder(Cmp);
+ Value *IsFPClass = Builder.createIsFPClass(ClassVal, ClassTest);
+ Cmp->replaceAllUsesWith(IsFPClass);
+ RecursivelyDeleteTriviallyDeadInstructions(Cmp);
+ return true;
+ }
+
+ return false;
+}
+
bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (sinkCmpExpression(Cmp, *TLI))
return true;
@@ -1959,6 +1983,9 @@ bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
if (swapICmpOperandsToExposeCSEOpportunities(Cmp))
return true;
+ if (foldFCmpToFPClassTest(Cmp))
+ return true;
+
return false;
}
diff --git a/llvm/test/CodeGen/AArch64/isinf.ll b/llvm/test/CodeGen/AArch64/isinf.ll
index 458bd7eeba16cf..834417b98743a8 100644
--- a/llvm/test/CodeGen/AArch64/isinf.ll
+++ b/llvm/test/CodeGen/AArch64/isinf.ll
@@ -58,22 +58,14 @@ define i32 @replace_isinf_call_f64(double %x) {
define i32 @replace_isinf_call_f128(fp128 %x) {
; CHECK-LABEL: replace_isinf_call_f128:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #32
-; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: ldrb w8, [sp, #15]
-; CHECK-NEXT: and w8, w8, #0x7f
-; CHECK-NEXT: strb w8, [sp, #15]
-; CHECK-NEXT: adrp x8, .LCPI3_0
-; CHECK-NEXT: ldr q0, [sp]
-; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI3_0]
-; CHECK-NEXT: bl __eqtf2
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: str q0, [sp, #-16]!
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldp x9, x8, [sp], #16
+; CHECK-NEXT: and x8, x8, #0x7fffffffffffffff
+; CHECK-NEXT: eor x8, x8, #0x7fff000000000000
+; CHECK-NEXT: orr x8, x9, x8
+; CHECK-NEXT: cmp x8, #0
; CHECK-NEXT: cset w0, eq
-; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
%abs = tail call fp128 @llvm.fabs.f128(fp128 %x)
%cmpinf = fcmp oeq fp128 %abs, 0xL00000000000000007FFF000000000000
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 6fa7df913812a3..ed9ce4d62383b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -61,10 +61,10 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT: v_mov_b32_e32 v0, 0x207
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s0|, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -72,11 +72,11 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT: v_mov_b32_e32 v0, 0x207
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -88,7 +88,7 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x207
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
@@ -143,25 +143,29 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
; SI-LABEL: test_isfinite_pattern_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x1f8
+; SI-NEXT: v_mov_b32_e32 v0, 0x1fb
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT: v_mov_b32_e32 v0, 0x1fb
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -173,8 +177,10 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
+; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1fb
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -349,13 +355,13 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT: v_mov_b32_e32 v0, 0x1fb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2
-; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0
-; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s3, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -363,11 +369,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; VI-LABEL: test_isfinite_not_pattern_2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT: v_mov_b32_e32 v0, 0x1fb
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2
-; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0
-; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s3, v0
+; VI-NEXT: s_and_b64 s[2:3], s[4:5], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -380,7 +386,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3|
+; GFX11-NEXT: v_cmp_class_f32_e64 s3, s3, 0x1fb
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s2, s3
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
@@ -405,11 +411,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; SI-NEXT: v_mov_b32_e32 v0, 0x1fb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_u_f32_e64 s[0:1], s2, s2
-; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s2|, v0
-; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -418,11 +424,11 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
+; VI-NEXT: v_mov_b32_e32 v0, 0x1fb
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4
-; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s4|, v0
-; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -437,7 +443,7 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1fb
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
@@ -458,25 +464,29 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
; SI-LABEL: test_isfinite_pattern_4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x1f8
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -488,8 +498,10 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -508,25 +520,29 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
; SI-LABEL: test_isfinite_pattern_4_commute_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
+; SI-NEXT: s_load_dword s2, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x1f8
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cmp_class_f32_e32 vcc, s0, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2
+; SI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_4_commute_and:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
+; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], vcc, s[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -538,8 +554,10 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s2, s3
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -618,16 +636,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isinf_pattern_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s1, 0x7f800000
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
-; SI-NEXT: v_cmp_eq_f32_e32 vcc, s1, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: s_cmpk_eq_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isinf_pattern_f16:
@@ -667,27 +685,32 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isfinite_pattern_0_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_movk_i32 s1, 0x1f8
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
+; VI-NEXT: v_mov_b32_e32 v0, 0x1fb
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
+; VI-NEXT: v_cmp_o_f16_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -699,8 +722,10 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_o_f16_e64 s3, s2, s2
+; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1fb
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s2, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_nop 0
@@ -718,27 +743,32 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isfinite_pattern_4_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_movk_i32 s1, 0x1f8
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT: s_cmpk_lt_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_...
[truncated]
define i1 @test_fp128_is_inf_or_nan(fp128 %arg) {
; CHECK-LABEL: define i1 @test_fp128_is_inf_or_nan(
; CHECK-SAME: fp128 [[ARG:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.is.fpclass.f128(fp128 [[ARG]], i32 519)
This doesn't fully reverse the canonicalization. We still lower @llvm.is.fpclass.f128() using runtime library functions in the x86 backend.
If is.fpclass is not natively supported, SDAG will expand it.
Godbolt: https://godbolt.org/z/dY3r3cr7j
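(For reference, the generic expansion of an f32 inf test amounts to an integer bit test; a rough IR-level sketch, not the literal SDAG output:)

  %bits = bitcast float %x to i32
  %abs = and i32 %bits, 2147483647        ; 0x7fffffff: clear the sign bit
  %isinf = icmp eq i32 %abs, 2139095040   ; 0x7f800000: inf has an all-ones exponent and zero mantissa

This matches the shape of the AArch64 fp128 code in the diff above, which likewise masks off the sign bit and compares against the inf bit pattern.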
Ping.

Ping.
Ping for review. X86 requires this patch to fix the regression below: https://gcc.godbolt.org/z/8P1do36qd
Still has the minor outstanding comment about checking legality.
Sorry for the delay.
Ping @andykaylor
LGTM for x86 tests.
I will fix it later.
; P8-NEXT: xscvdpspn 0, 1
; P8-NEXT: lis 4, 32639
; P8-NEXT: ori 4, 4, 65535
; P8-NEXT: mffprwz 3, 0 | ||
; P8-NEXT: clrlwi 3, 3, 1 | ||
; P8-NEXT: sub 3, 4, 3 | ||
; P8-NEXT: rldicl 3, 3, 1, 63 |
This looks worse. The cost of expanding some compares isn't any higher than the legal operations, since it's just commuting.
Ping for review from PowerPC maintainers.
This looks worse. The cost of expanding some compares isn't any higher than the legal operations, since it's just commuting.
Before:
Iterations: 100
Instructions: 600
Total Cycles: 1905
Total uOps: 1200
Dispatch Width: 8
uOps Per Cycle: 0.63
IPC: 0.31
Block RThroughput: 1.5
After:
Iterations: 100
Instructions: 700
Total Cycles: 614
Total uOps: 1400
Dispatch Width: 8
uOps Per Cycle: 2.28
IPC: 1.14
Block RThroughput: 1.8
It looks better :)
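(These summaries look like llvm-mca output; assuming the before/after assembly is saved to files, an invocation along these lines should reproduce comparable numbers — the file name is a placeholder:)

llvm-mca -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 isinf-after.s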
TBH, I prefer the transformed one. The unchanged one uses a TOC entry for the inf constant, and the lfs is time-consuming on PPC and has memory-access overhead.
; P8-NEXT: xor 4, 4, 5
; P8-NEXT: or 3, 3, 4
; P8-NEXT: cntlzd 3, 3
; P8-NEXT: rldicl 3, 3, 58, 63 |
Nice improvement.
@chenzheng1030 Can I merge this PR?
LGTM for the PPC case changes.
In commit 2b58244, we canonicalize the isInf/isNanOrInf idiom into fabs+fcmp for better analysis/codegen (see also the discussion in #76338).

This patch reverses the fabs+fcmp to is.fpclass. If is.fpclass is not supported by the target, it will be expanded by TLI.

Fixes the regression introduced by 2b58244 and #80414 (comment).

It is an alternative to #81404.
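For illustration, a minimal IR sketch of the reversal (a hypothetical f32 function, not taken from the patch; the masks are FPClassTest bit values — fcInf = 0x204 = 516, fcNan|fcInf = 0x207 = 519, fcFinite = 0x1f8 — matching the constants in the tests above):

define i1 @is_inf_idiom(float %x) {
  ; Canonical form after 2b58244: fabs + compare against +infinity.
  %abs = call float @llvm.fabs.f32(float %x)
  %isinf = fcmp oeq float %abs, 0x7FF0000000000000
  ret i1 %isinf
}

define i1 @is_inf_reversed(float %x) {
  ; Form after foldFCmpToFPClassTest: a single class test (516 == fcInf).
  %isinf = call i1 @llvm.is.fpclass.f32(float %x, i32 516)
  ret i1 %isinf
}

declare float @llvm.fabs.f32(float)
declare i1 @llvm.is.fpclass.f32(float, i32)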