From 5c68a1cb123161b54b72ce90e7975d95a8eaf2a4 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 26 Sep 2022 23:07:49 -0400
Subject: [PATCH] AMDGPU: Make various vector undefs legal

Surprisingly, these were getting legalized to zero-initialized
vectors.

This fixes an infinite loop when combining some vector types, and it
also fixes undef values being zero-initialized.

SimplifyDemandedVectorElts / SimplifyDemandedBits do not check whether
the undefs they substitute for unused operations are legal for the
target. This resulted in turning vectors into undefs that were later
re-legalized back into zero vectors, so the combine/legalize cycle
never reached a fixed point (a standalone sketch of this cycle follows
the patch).

(cherry picked from commit 7a84624079a2656c684bed6100708544500c5a32)
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   2 +
 llvm/test/CodeGen/AMDGPU/commute-shifts.ll    |  16 --
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll |   4 +-
 .../CodeGen/AMDGPU/dagcombine-fma-fmad.ll     |  10 +-
 .../CodeGen/AMDGPU/extract-subvector-16bit.ll |  78 +------
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |  14 +-
 llvm/test/CodeGen/AMDGPU/select-undef.ll      | 219 +++++++++++++++++-
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll      | 113 ++-------
 llvm/test/CodeGen/AMDGPU/v1024.ll             |   2 +
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll   | 144 ++----------
 llvm/test/CodeGen/AMDGPU/wqm.ll               | 162 ++-----------
 11 files changed, 306 insertions(+), 458 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f7d139adc63bac..f6b7d1ffc6d27c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -249,6 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
       case ISD::STORE:
       case ISD::BUILD_VECTOR:
       case ISD::BITCAST:
+      case ISD::UNDEF:
       case ISD::EXTRACT_VECTOR_ELT:
       case ISD::INSERT_VECTOR_ELT:
       case ISD::EXTRACT_SUBVECTOR:
@@ -516,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
       case ISD::STORE:
       case ISD::BUILD_VECTOR:
       case ISD::BITCAST:
+      case ISD::UNDEF:
       case ISD::EXTRACT_VECTOR_ELT:
       case ISD::INSERT_VECTOR_ELT:
       case ISD::INSERT_SUBVECTOR:
diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
index 8df85ba872bfbf..3697946cb5c398 100644
--- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -5,14 +5,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 ; SI-LABEL: main:
 ; SI:       ; %bb.0: ; %bb
-; SI-NEXT:    s_mov_b32 s0, 0
-; SI-NEXT:    s_mov_b32 s1, s0
-; SI-NEXT:    s_mov_b32 s2, s0
-; SI-NEXT:    s_mov_b32 s3, s0
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_mov_b32 s5, s0
-; SI-NEXT:    s_mov_b32 s6, s0
-; SI-NEXT:    s_mov_b32 s7, s0
 ; SI-NEXT:    image_load v2, v0, s[0:7] dmask:0x1 unorm
 ; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 7, v0
@@ -26,14 +18,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 ;
 ; VI-LABEL: main:
 ; VI:       ; %bb.0: ; %bb
-; VI-NEXT:    s_mov_b32 s0, 0
-; VI-NEXT:    s_mov_b32 s1, s0
-; VI-NEXT:    s_mov_b32 s2, s0
-; VI-NEXT:    s_mov_b32 s3, s0
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s5, s0
-; VI-NEXT:    s_mov_b32 s6, s0
-; VI-NEXT:    s_mov_b32 s7, s0
 ; VI-NEXT:    image_load v2, v0, s[0:7] dmask:0x1 unorm
 ; VI-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; VI-NEXT:    v_and_b32_e32 v0, 7, v0
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 29fc098899ee5c..5d985850446cc3 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ 
b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -213,7 +213,7 @@ if.else: ; preds = %entry br label %if.end if.end: ; preds = %if.else, %if.then - %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ] + %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ] store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef ret void } @@ -266,7 +266,7 @@ if.else: ; preds = %entry br label %if.end if.end: ; preds = %if.else, %if.then - %call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ] + %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ] store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll index 8af7575f03d06d..0b629efffbb30e 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -4,16 +4,8 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { ; GCN-LABEL: _amdgpu_ps_main: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s2, s0 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s0 -; GCN-NEXT: s_mov_b32 s6, s0 -; GCN-NEXT: s_mov_b32 s7, s0 ; GCN-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 6456c87a31fbfc..cbfd8ec5cb16e6 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -100,14 +100,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16 ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: s_branch .LBB0_4 ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB0_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -244,14 +237,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i ; GFX9-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-NEXT: s_branch .LBB1_4 ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB1_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -386,14 +372,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: s_branch .LBB2_4 ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; 
GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB2_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -567,22 +546,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x ; GFX9-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-NEXT: s_branch .LBB3_4 ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB3_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc @@ -759,22 +723,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: s_branch .LBB4_4 ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB4_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc @@ -949,22 +898,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16 ; GFX9-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-NEXT: s_branch .LBB5_4 ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB5_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index cc4ece6c7059f7..f742d2c0bda4d5 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -374,18 +374,10 @@ define <4 x float> @insertelement_to_sgpr() nounwind { ; GCN-LABEL: insertelement_to_sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GCN-NEXT: 
s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, 0 -; GCN-NEXT: s_mov_b32 s4, s12 -; GCN-NEXT: s_mov_b32 s5, s12 -; GCN-NEXT: s_mov_b32 s6, s12 -; GCN-NEXT: s_mov_b32 s7, s12 -; GCN-NEXT: s_mov_b32 s8, s12 -; GCN-NEXT: s_mov_b32 s9, s12 -; GCN-NEXT: s_mov_b32 s10, s12 -; GCN-NEXT: s_mov_b32 s11, s12 -; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index 6597d6784e0c23..f02cd3fc5e4e65 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}select_undef_lhs: ; GCN: s_waitcnt @@ -43,3 +43,220 @@ define void @select_undef_n2(float addrspace(1)* %a, i32 %c) { } declare float @llvm.amdgcn.rcp.f32(float) + + +; Make sure the vector undef isn't lowered into 0s. +; GCN-LABEL: {{^}}undef_v6f32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <6 x float>, <6 x float> addrspace(3)* undef + %add = fadd <6 x float> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <6 x float> %add, <6 x float> addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}undef_v6i32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef + %add = add <6 x i32> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef + ret void +} + +; Make sure the vector undef isn't lowered into 0s. 
+; GCN-LABEL: {{^}}undef_v5f32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <5 x float>, <5 x float> addrspace(3)* undef + %add = fadd <5 x float> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <5 x float> %add, <5 x float> addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}undef_v5i32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef + %add = add <5 x i32> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef + ret void +} + +; Make sure the vector undef isn't lowered into 0s. +; GCN-LABEL: {{^}}undef_v3f64: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr + %add = fadd <3 x double> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v3i64: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr + %add = add <3 x i64> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr + ret void +} + +; Make sure the vector undef isn't lowered into 0s. +; GCN-LABEL: {{^}}undef_v4f16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr + %add = fadd <4 x half> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v4i16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr + %add = add <4 x i16> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr + ret void +} + +; Make sure the vector undef isn't lowered into 0s. 
+; GCN-LABEL: {{^}}undef_v2f16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr + %add = fadd <2 x half> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v2i16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr + %add = add <2 x i16> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr + ret void +} + +; We were expanding undef vectors into zero vectors. Optimizations +; would then see we used no elements of the vector, and reform the +; undef vector resulting in a combiner loop. +; GCN-LABEL: {{^}}inf_loop_undef_vector: +; GCN: s_waitcnt +; GCN-NEXT: v_mad_u64_u32 +; GCN-NEXT: v_mul_lo_u32 +; GCN-NEXT: v_mul_lo_u32 +; GCN-NEXT: v_add3_u32 +; GCN-NEXT: global_store_dwordx2 +define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) { + %i = insertelement <6 x float> %arg, float %arg1, i64 2 + %i3 = bitcast <6 x float> %i to <3 x i64> + %i4 = extractelement <3 x i64> %i3, i64 0 + %i5 = extractelement <3 x i64> %i3, i64 1 + %i6 = mul i64 %i5, %arg2 + %i7 = add i64 %i6, %i4 + store volatile i64 %i7, i64 addrspace(1)* undef, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index ada6c1da04e2ca..7080c84f7b50a2 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1397,28 +1397,20 @@ bb7: ; preds = %bb4 define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 { ; SI-LABEL: if_after_kill_block: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec -; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB13_3 ; SI-NEXT: ; %bb.1: ; %bb3 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_cbranch_scc0 .LBB13_6 ; SI-NEXT: ; %bb.2: ; %bb3 ; SI-NEXT: s_andn2_b64 exec, exec, vcc ; SI-NEXT: .LBB13_3: ; %bb4 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_mov_b32 s1, s0 -; SI-NEXT: s_mov_b32 s2, s0 -; SI-NEXT: s_mov_b32 s3, s0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s0 -; SI-NEXT: s_mov_b32 s6, s0 -; SI-NEXT: s_mov_b32 s7, s0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 @@ -1439,28 +1431,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; ; GFX10-WAVE64-LABEL: if_after_kill_block: ; GFX10-WAVE64: ; %bb.0: ; %bb -; 
GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-WAVE64-NEXT: s_wqm_b64 exec, exec ; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; GFX10-WAVE64-NEXT: s_mov_b32 s0, 0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc ; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4 -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10-WAVE64-NEXT: s_mov_b32 s1, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s2, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s3, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s4, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s5, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s6, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s7, s0 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-WAVE64-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 @@ -1479,28 +1463,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; ; GFX10-WAVE32-LABEL: if_after_kill_block: ; GFX10-WAVE32: ; %bb.0: ; %bb -; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-WAVE32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v1 -; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 -; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo +; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo ; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4 -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-WAVE32-NEXT: s_mov_b32 s1, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s2, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s3, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s4, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s5, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s6, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s7, s0 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-WAVE32-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 @@ -1519,29 +1495,22 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; ; GFX11-LABEL: if_after_kill_block: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_wqm_b64 exec, exec -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b64 s[4:5], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1 -; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; 
GFX11-NEXT: s_cbranch_execz .LBB13_3 ; GFX11-NEXT: ; %bb.1: ; %bb3 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX11-NEXT: ; %bb.2: ; %bb3 ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc ; GFX11-NEXT: .LBB13_3: ; %bb4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1584,19 +1553,11 @@ bb9: ; preds = %bb4 define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; SI-LABEL: cbranch_kill: ; SI: ; %bb.0: ; %.entry -; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: s_mov_b32 s5, s4 -; SI-NEXT: s_mov_b32 s6, s4 -; SI-NEXT: s_mov_b32 s7, s4 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s4 -; SI-NEXT: s_mov_b32 s10, s4 -; SI-NEXT: s_mov_b32 s11, s4 -; SI-NEXT: image_sample_l v1, v[1:4], s[4:11], s[0:3] dmask:0x1 da +; SI-NEXT: image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc @@ -1627,16 +1588,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE64-LABEL: cbranch_kill: ; GFX10-WAVE64: ; %bb.0: ; %.entry ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE64-NEXT: s_mov_b32 s4, 0 ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec -; GFX10-WAVE64-NEXT: s_mov_b32 s5, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s6, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s7, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s8, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s9, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s10, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s11, s4 -; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc @@ -1667,16 +1620,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE32-LABEL: cbranch_kill: ; GFX10-WAVE32: ; %bb.0: ; %.entry ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE32-NEXT: s_mov_b32 s4, 0 ; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo -; GFX10-WAVE32-NEXT: s_mov_b32 s5, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s6, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s7, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s8, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s9, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s10, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s11, s4 -; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_ge_f32_e32 vcc_lo, 0, v1 ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo @@ -1707,16 +1652,8 @@ define amdgpu_ps void 
@cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX11-LABEL: cbranch_kill: ; GFX11: ; %bb.0: ; %.entry ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: s_mov_b32 s5, s4 -; GFX11-NEXT: s_mov_b32 s6, s4 -; GFX11-NEXT: s_mov_b32 s7, s4 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s4 -; GFX11-NEXT: s_mov_b32 s10, s4 -; GFX11-NEXT: s_mov_b32 s11, s4 -; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmpx_ge_f32_e32 0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v1024.ll b/llvm/test/CodeGen/AMDGPU/v1024.ll index a5e0454a36344e..1326ba437f94ff 100644 --- a/llvm/test/CodeGen/AMDGPU/v1024.ll +++ b/llvm/test/CodeGen/AMDGPU/v1024.ll @@ -10,6 +10,7 @@ define amdgpu_kernel void @test_v1024() { entry: %alloca = alloca <32 x i32>, align 16, addrspace(5) %cast = bitcast <32 x i32> addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %cast, i8 0, i32 128, i1 false) br i1 undef, label %if.then.i.i, label %if.else.i if.then.i.i: ; preds = %entry @@ -24,6 +25,7 @@ if.then.i62.i: ; preds = %if.else.i, %if.then ret void } +declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture readonly, i8, i32, i1 immarg) declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg) declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index 5164b072a6ddc7..ed0de729dafdfa 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -14,7 +14,6 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: v_mov_b32_e32 v36, v16 @@ -22,13 +21,6 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: v_mov_b32_e32 v34, v14 ; GFX9-NEXT: v_mov_b32_e32 v33, v13 ; GFX9-NEXT: v_mov_b32_e32 v32, v12 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_mov_b32 s6, s4 -; GFX9-NEXT: s_mov_b32 s7, s4 -; GFX9-NEXT: s_mov_b32 s8, s4 -; GFX9-NEXT: s_mov_b32 s9, s4 -; GFX9-NEXT: s_mov_b32 s10, s4 -; GFX9-NEXT: s_mov_b32 s11, s4 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -82,16 +74,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: v_mov_b32_e32 v34, v14 ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 -; GFX10-NEXT: s_mov_b32 s5, s4 -; GFX10-NEXT: s_mov_b32 s6, s4 -; GFX10-NEXT: s_mov_b32 s7, s4 -; GFX10-NEXT: s_mov_b32 s8, s4 -; GFX10-NEXT: 
s_mov_b32 s9, s4 -; GFX10-NEXT: s_mov_b32 s10, s4 -; GFX10-NEXT: s_mov_b32 s11, s4 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -145,16 +129,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 ; GFX11-NEXT: v_mov_b32_e32 v32, v12 -; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: s_mov_b32 s33, s32 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 @@ -225,65 +201,41 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v40, s36, 2 -; GFX9-NEXT: v_writelane_b32 v40, s37, 3 -; GFX9-NEXT: v_writelane_b32 v40, s38, 4 -; GFX9-NEXT: v_writelane_b32 v40, s39, 5 -; GFX9-NEXT: v_writelane_b32 v40, s40, 6 -; GFX9-NEXT: v_writelane_b32 v40, s41, 7 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v40, s42, 8 -; GFX9-NEXT: s_mov_b32 s36, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s43, 9 ; GFX9-NEXT: v_mov_b32_e32 v45, v16 ; GFX9-NEXT: v_mov_b32_e32 v44, v15 ; GFX9-NEXT: v_mov_b32_e32 v43, v14 ; GFX9-NEXT: v_mov_b32_e32 v42, v13 ; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: s_mov_b32 s37, s36 -; GFX9-NEXT: s_mov_b32 s38, s36 -; GFX9-NEXT: s_mov_b32 s39, s36 -; GFX9-NEXT: s_mov_b32 s40, s36 -; GFX9-NEXT: s_mov_b32 s41, s36 -; GFX9-NEXT: s_mov_b32 s42, s36 -; GFX9-NEXT: s_mov_b32 s43, s36 -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: 
s_nop 0 ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s43, v40, 9 -; GFX9-NEXT: v_readlane_b32 s42, v40, 8 -; GFX9-NEXT: v_readlane_b32 s41, v40, 7 -; GFX9-NEXT: v_readlane_b32 s40, v40, 6 -; GFX9-NEXT: v_readlane_b32 s39, v40, 5 -; GFX9-NEXT: v_readlane_b32 s38, v40, 4 -; GFX9-NEXT: v_readlane_b32 s37, v40, 3 -; GFX9-NEXT: v_readlane_b32 s36, v40, 2 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -298,66 +250,42 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v41, v16 ; GFX10-NEXT: v_mov_b32_e32 v42, v15 ; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_mov_b32_e32 v44, v13 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v44, v13 ; GFX10-NEXT: v_mov_b32_e32 v45, v12 -; GFX10-NEXT: v_writelane_b32 v40, s36, 2 -; GFX10-NEXT: s_mov_b32 s36, 0 -; GFX10-NEXT: v_writelane_b32 v40, s37, 3 -; GFX10-NEXT: s_mov_b32 s37, s36 -; GFX10-NEXT: v_writelane_b32 v40, s38, 4 -; GFX10-NEXT: s_mov_b32 s38, s36 -; GFX10-NEXT: v_writelane_b32 v40, s39, 5 -; GFX10-NEXT: s_mov_b32 s39, s36 -; GFX10-NEXT: v_writelane_b32 v40, s40, 6 -; GFX10-NEXT: s_mov_b32 s40, s36 -; GFX10-NEXT: v_writelane_b32 v40, s41, 7 -; GFX10-NEXT: s_mov_b32 s41, s36 -; GFX10-NEXT: v_writelane_b32 v40, s42, 8 -; GFX10-NEXT: s_mov_b32 s42, s36 -; GFX10-NEXT: v_writelane_b32 v40, s43, 9 -; GFX10-NEXT: s_mov_b32 s43, s36 -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 
s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_clause 0x4 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 -; GFX10-NEXT: v_readlane_b32 s43, v40, 9 -; GFX10-NEXT: v_readlane_b32 s42, v40, 8 -; GFX10-NEXT: v_readlane_b32 s41, v40, 7 -; GFX10-NEXT: v_readlane_b32 s40, v40, 6 -; GFX10-NEXT: v_readlane_b32 s39, v40, 5 -; GFX10-NEXT: v_readlane_b32 s38, v40, 4 -; GFX10-NEXT: v_readlane_b32 s37, v40, 3 -; GFX10-NEXT: v_readlane_b32 s36, v40, 2 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -372,7 +300,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 @@ -380,56 +308,32 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 ; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v45, s33 +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 ; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_mov_b32_e32 v45, v12 -; GFX11-NEXT: v_writelane_b32 v40, s36, 2 -; GFX11-NEXT: s_mov_b32 s36, 0 -; GFX11-NEXT: v_writelane_b32 v40, s37, 3 -; GFX11-NEXT: s_mov_b32 s37, s36 -; GFX11-NEXT: v_writelane_b32 v40, s38, 4 -; GFX11-NEXT: s_mov_b32 s38, s36 -; GFX11-NEXT: v_writelane_b32 v40, s39, 5 -; GFX11-NEXT: s_mov_b32 s39, s36 -; GFX11-NEXT: v_writelane_b32 v40, s40, 6 -; GFX11-NEXT: s_mov_b32 s40, s36 -; GFX11-NEXT: v_writelane_b32 v40, s41, 7 -; GFX11-NEXT: s_mov_b32 s41, s36 -; GFX11-NEXT: v_writelane_b32 v40, s42, 8 -; GFX11-NEXT: s_mov_b32 s42, s36 -; GFX11-NEXT: v_writelane_b32 v40, s43, 9 -; GFX11-NEXT: s_mov_b32 s43, s36 -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, 
extern_func@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_load_b32 v45, off, s33 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 -; GFX11-NEXT: v_readlane_b32 s43, v40, 9 -; GFX11-NEXT: v_readlane_b32 s42, v40, 8 -; GFX11-NEXT: v_readlane_b32 s41, v40, 7 -; GFX11-NEXT: v_readlane_b32 s40, v40, 6 -; GFX11-NEXT: v_readlane_b32 s39, v40, 5 -; GFX11-NEXT: v_readlane_b32 s38, v40, 4 -; GFX11-NEXT: v_readlane_b32 s37, v40, 3 -; GFX11-NEXT: v_readlane_b32 s36, v40, 2 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index dc85462631d4a7..16c30174657a5f 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1833,87 +1833,54 @@ main_body: define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX9-W64-LABEL: test_loop_vcc: ; GFX9-W64: ; %bb.0: ; %entry -; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-W64-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 ; GFX9-W64-NEXT: s_branch .LBB31_2 ; GFX9-W64-NEXT: .LBB31_1: ; %body ; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8 -; GFX9-W64-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4 ; GFX9-W64-NEXT: .LBB31_2: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s10, v8 +; GFX9-W64-NEXT: 
v_cmp_lt_f32_e32 vcc, s4, v8 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1 ; GFX9-W64-NEXT: ; %bb.3: -; GFX9-W64-NEXT: s_mov_b64 s[2:3], -1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8 ; GFX9-W64-NEXT: .LBB31_4: ; %break -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_loop_vcc: ; GFX10-W32: ; %bb.0: ; %entry -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_branch .LBB31_2 ; GFX10-W32-NEXT: .p2align 6 ; GFX10-W32-NEXT: .LBB31_1: ; %body ; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 -; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_mov_b32 s1, 0 +; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4 ; GFX10-W32-NEXT: .LBB31_2: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1925,11 +1892,10 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1 ; GFX10-W32-NEXT: ; %bb.3: -; GFX10-W32-NEXT: s_mov_b32 s1, -1 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8 ; GFX10-W32-NEXT: .LBB31_4: ; %break -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 @@ -1999,14 +1965,6 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX9-W64-NEXT: v_lshl_add_u32 v0, v2, 2, v0 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2035,14 +1993,6 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: buffer_load_dword v0, v2, s[8:11], 0 offen ; GFX10-W32-NEXT: s_and_b32 
exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2079,18 +2029,10 @@ entry: define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; GFX9-W64-LABEL: test_nonvoid_return: ; GFX9-W64: ; %bb.0: -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2098,18 +2040,10 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; ; GFX10-W32-LABEL: test_nonvoid_return: ; GFX10-W32: ; %bb.0: -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2128,20 +2062,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { ; GFX9-W64-LABEL: test_nonvoid_return_unreachable: ; GFX9-W64: ; %bb.0: ; %entry -; GFX9-W64-NEXT: s_mov_b32 s4, 0 -; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, exec ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB34_2 ; GFX9-W64-NEXT: ; %bb.1: ; %else @@ -2155,20 +2080,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) noun ; ; GFX10-W32-LABEL: test_nonvoid_return_unreachable: ; GFX10-W32: ; %bb.0: ; %entry -; 
GFX10-W32-NEXT: s_mov_b32 s4, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s5, s4 -; GFX10-W32-NEXT: s_mov_b32 s6, s4 -; GFX10-W32-NEXT: s_mov_b32 s7, s4 -; GFX10-W32-NEXT: s_mov_b32 s8, s4 -; GFX10-W32-NEXT: s_mov_b32 s9, s4 -; GFX10-W32-NEXT: s_mov_b32 s10, s4 -; GFX10-W32-NEXT: s_mov_b32 s11, s4 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB34_2 ; GFX10-W32-NEXT: ; %bb.1: ; %else @@ -2215,33 +2131,17 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX9-W64-NEXT: ; %bb.1: ; %else -; GFX9-W64-NEXT: s_mov_b32 s4, 0 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1 -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 -; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 ; GFX9-W64-NEXT: s_branch .LBB35_4 ; GFX9-W64-NEXT: .LBB35_2: ; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-W64-NEXT: .LBB35_3: ; %if -; GFX9-W64-NEXT: s_mov_b32 s4, 0 -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: .LBB35_4: ; %end ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0 @@ -2252,21 +2152,13 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { ; GFX10-W32-LABEL: test_scc: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX10-W32-NEXT: ; %bb.1: ; %else ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 ; GFX10-W32-NEXT: s_branch .LBB35_4 @@ -2275,17 +2167,9 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { ; 
GFX10-W32-NEXT: .LBB35_3: ; %if ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: .LBB35_4: ; %end -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
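
The combine/legalize cycle referenced in the commit message, modeled as a
standalone C++ toy. This is a minimal sketch, not LLVM code: Node, legalize,
and combine are hypothetical stand-ins for a SelectionDAG node, the type
legalizer, and SimplifyDemandedVectorElts. It only illustrates why the two
rewrites cannot reach a fixed point unless undef is legal for the type.

// Toy model of the legalize/combine ping-pong described in the commit
// message. All names are hypothetical illustrations, not LLVM APIs.
#include <cstdio>
#include <initializer_list>

enum class Node { Undef, ZeroVector };

// Legalizer: with no rule marking UNDEF legal for this vector type, an
// undef vector gets expanded into a zero-initialized BUILD_VECTOR.
static Node legalize(Node N, bool UndefIsLegal) {
  if (N == Node::Undef && !UndefIsLegal)
    return Node::ZeroVector;
  return N;
}

// Combiner: stands in for SimplifyDemandedVectorElts. No element of the
// vector is demanded, so a zero vector is folded back into undef; the
// legality of the resulting undef is never checked.
static Node combine(Node N) {
  return N == Node::ZeroVector ? Node::Undef : N;
}

int main() {
  for (bool UndefIsLegal : {false, true}) {
    Node N = Node::Undef;
    int Iterations = 0;
    bool Changed = true;
    // Fixed-point driver: rerun both passes while anything changed, with
    // a cap so the broken configuration still terminates for the demo.
    while (Changed && Iterations < 8) {
      Changed = false;
      Node L = legalize(N, UndefIsLegal);
      if (L != N) { N = L; Changed = true; }
      Node C = combine(N);
      if (C != N) { N = C; Changed = true; }
      ++Iterations;
    }
    std::printf("undef legal = %d: %s after %d iteration(s)\n",
                (int)UndefIsLegal,
                Changed ? "still flip-flopping (infinite loop)"
                        : "reached a fixed point",
                Iterations);
  }
  return 0;
}

The patch's actual fix is the pair of added "case ISD::UNDEF:" lines in
SIISelLowering.cpp above: they keep undef legal for the custom-handled vector
types, so the legalizer leaves it alone and the cycle terminates, as the
regenerated test checks (implicit-def markers instead of s_mov_b32 zero
broadcasts) show.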