-
Notifications
You must be signed in to change notification settings - Fork 12.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Legalize fminimum and fmaximum f32 for gfx950 #117634
Merged
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/legalize-fminimum-fmaximum-f32
Nov 26, 2024
Merged
AMDGPU: Legalize fminimum and fmaximum f32 for gfx950 #117634
arsenm
merged 1 commit into
main
from
users/arsenm/gfx950/legalize-fminimum-fmaximum-f32
Nov 26, 2024
Conversation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This was referenced Nov 25, 2024
arsenm
requested review from
pravinjagtap,
rampitec,
shiltian,
Sisyph and
srpande
November 25, 2024 21:59
This was referenced Nov 25, 2024
This was referenced Nov 25, 2024
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-mc Author: Matt Arsenault (arsenm) Changes: Select to minimum3/maximum3. Leave f16/v2f16 for later. Patch is 185.07 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117634.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d35bb15ac6566a..914b25245c95bf 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -855,6 +855,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
{MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
Custom);
+ } else {
+ // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
+ if (Subtarget->hasMinimum3Maximum3F32())
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
}
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 5d4d56e8b0ad22..2b207e008581b3 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1234,6 +1234,23 @@ def : IntClampPat<V_MQSAD_PK_U16_U8_e64, int_amdgcn_mqsad_pk_u16_u8>;
def : IntClampPat<V_QSAD_PK_U16_U8_e64, int_amdgcn_qsad_pk_u16_u8>;
def : IntClampPat<V_MQSAD_U32_U8_e64, int_amdgcn_mqsad_u32_u8>;
+//===----------------------------------------------------------------------===//
+// Floating-point operation Patterns
+//===----------------------------------------------------------------------===//
+
+// Implement fminimum(x, y) by using minimum3(x, y, y)
+class MinimumMaximumByMinimum3Maximum3<SDPatternOperator node, ValueType vt,
+ Instruction inst> : GCNPat<
+ (vt (node (VOP3Mods vt:$src0, i32:$src0_mods), (VOP3Mods vt:$src1, i32:$src1_mods))),
+ (inst $src0_mods, $src0, $src1_mods, $src1, $src1_mods, $src1)
+>;
+
+// Prefer the real 2 operand form if legal
+let SubtargetPredicate = HasMinimum3Maximum3F32, AddedComplexity = -1000 in {
+def : MinimumMaximumByMinimum3Maximum3<fminimum, f32, V_MINIMUM3_F32_e64>;
+def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
+}
+
//===----------------------------------------------------------------------===//
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 08122cd0d89eab..209ae86b4dedce 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -14,19 +14,26 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call float @llvm.maximum.f32(float %a, float %b)
%max1 = call float @llvm.maximum.f32(float %max0, float %c)
ret float %max1
@@ -43,19 +50,26 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_commute:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e32 v1, v2, v0
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_commute:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e32 v1, v2, v0
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_commute:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT: v_maximum3_f32 v0, v2, v0, v0
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%max0 = call float @llvm.maximum.f32(float %a, float %b)
%max1 = call float @llvm.maximum.f32(float %c, float %max0)
ret float %max1
@@ -70,21 +84,30 @@ define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inre
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
; GFX12-NEXT: ; return to shader part epilog
;
-; GFX9-LABEL: s_fmaximum3_f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v0, s1
-; GFX9-NEXT: v_max_f32_e32 v1, s0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: v_max_f32_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: ; return to shader part epilog
+; GFX940-LABEL: s_fmaximum3_f32:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: v_mov_b32_e32 v0, s1
+; GFX940-NEXT: v_max_f32_e32 v1, s0, v0
+; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT: v_max_f32_e32 v1, s2, v0
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s2, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_readfirstlane_b32 s0, v0
+; GFX940-NEXT: ; return to shader part epilog
+;
+; GFX950-LABEL: s_fmaximum3_f32:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-NEXT: v_maximum3_f32 v0, v0, s1, s1
+; GFX950-NEXT: v_maximum3_f32 v0, v0, s2, s2
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_readfirstlane_b32 s0, v0
+; GFX950-NEXT: ; return to shader part epilog
%max0 = call float @llvm.maximum.f32(float %a, float %b)
%max1 = call float @llvm.maximum.f32(float %max0, float %c)
%cast = bitcast float %max1 to i32
@@ -103,19 +126,26 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_fabs0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e64 v3, |v0|, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fabs0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e64 v3, |v0|, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fabs0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, |v0|, v1, v1
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call float @llvm.fabs.f32(float %a)
%max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
%max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -133,19 +163,26 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_fabs1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e64 v3, v0, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fabs1:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e64 v3, v0, |v1|
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fabs1:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, v0, |v1|, |v1|
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%b.fabs = call float @llvm.fabs.f32(float %b)
%max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
%max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -163,19 +200,26 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_fabs2:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e32 v3, v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fabs2:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e32 v3, v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e64 v1, v0, |v2|
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fabs2:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT: v_maximum3_f32 v0, v0, |v2|, |v2|
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%c.fabs = call float @llvm.fabs.f32(float %c)
%max0 = call float @llvm.maximum.f32(float %a, float %b)
%max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
@@ -193,19 +237,26 @@ define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e64 v1, v0, |v2|
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e64 v3, |v0|, |v1|
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e64 v1, v0, |v2|
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v1|
+; GFX950-NEXT: v_maximum3_f32 v0, v0, |v2|, |v2|
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call float @llvm.fabs.f32(float %a)
%b.fabs = call float @llvm.fabs.f32(float %b)
%c.fabs = call float @llvm.fabs.f32(float %c)
@@ -225,19 +276,26 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v1, -v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e64 v3, -v0, -v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e64 v1, v0, -v2
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e64 v3, -v0, -v1
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e64 v1, v0, -v2
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, -v0, -v1, -v1
+; GFX950-NEXT: v_maximum3_f32 v0, v0, -v2, -v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg float %a
%b.fneg = fneg float %b
%c.fneg = fneg float %c
@@ -257,19 +315,26 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e64 v1, v0, -|v2|
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2|
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e64 v3, -|v0|, -|v1|
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e64 v1, v0, -|v2|
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v1|
+; GFX950-NEXT: v_maximum3_f32 v0, v0, -|v2|, -|v2|
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call float @llvm.fabs.f32(float %a)
%b.fabs = call float @llvm.fabs.f32(float %b)
%c.fabs = call float @llvm.fabs.f32(float %c)
@@ -292,19 +357,26 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_fneg0:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e64 v3, -v0, v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg0:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_max_f32_e64 v3, -v0, v1
+; GFX940-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT: v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT: v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fneg0:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_maximum3_f32 v0, -v0, v1, v1
+; GFX950-NEXT: v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%a.fneg = fneg float %a
%max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
%max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -322,19 +394,26 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
; GFX12-NEXT: v_maximum3_f32 v0, v0, -v1, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fmaximum3_f32_fneg1:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_max_f32_e64 v3, v0, -v1
-; GFX9-NEXT: v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v1
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT: v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg1:
+; GFX940: ; %bb.0:
+; GFX940-NE...
[truncated]
|
This was referenced Nov 25, 2024
This was referenced Nov 26, 2024
Select to minimum3/maximum3. Leave f16/v2f16 for later since it's complicated by only having the vector version.
arsenm
force-pushed
the
users/arsenm/gfx950/legalize-fminimum-fmaximum-f32
branch
from
November 26, 2024 19:42
593f540
to
3a72829
Compare
arsenm
deleted the
users/arsenm/gfx950/legalize-fminimum-fmaximum-f32
branch
November 26, 2024 19:44
This was referenced Nov 26, 2024
Merged
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Select to minimum3/maximum3. Leave f16/v2f16 for later
since it's complicated by only having the vector version.