diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f7d139adc63bac..f6b7d1ffc6d27c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -249,6 +249,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, case ISD::STORE: case ISD::BUILD_VECTOR: case ISD::BITCAST: + case ISD::UNDEF: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: case ISD::EXTRACT_SUBVECTOR: @@ -516,6 +517,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, case ISD::STORE: case ISD::BUILD_VECTOR: case ISD::BITCAST: + case ISD::UNDEF: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: case ISD::INSERT_SUBVECTOR: diff --git a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll index 8df85ba872bfbf..3697946cb5c398 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-shifts.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-shifts.ll @@ -5,14 +5,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 { ; SI-LABEL: main: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_mov_b32 s0, 0 -; SI-NEXT: s_mov_b32 s1, s0 -; SI-NEXT: s_mov_b32 s2, s0 -; SI-NEXT: s_mov_b32 s3, s0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s0 -; SI-NEXT: s_mov_b32 s6, s0 -; SI-NEXT: s_mov_b32 s7, s0 ; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm ; SI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; SI-NEXT: v_and_b32_e32 v0, 7, v0 @@ -26,14 +18,6 @@ define amdgpu_ps float @main(float %arg0, float %arg1) #0 { ; ; VI-LABEL: main: ; VI: ; %bb.0: ; %bb -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_mov_b32 s1, s0 -; VI-NEXT: s_mov_b32 s2, s0 -; VI-NEXT: s_mov_b32 s3, s0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s0 -; VI-NEXT: s_mov_b32 s6, s0 -; VI-NEXT: s_mov_b32 s7, s0 ; VI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm ; VI-NEXT: v_cvt_i32_f32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 7, v0 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 29fc098899ee5c..5d985850446cc3 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -213,7 +213,7 @@ if.else: ; preds = %entry br label %if.end if.end: ; preds = %if.else, %if.then - %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ] + %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ zeroinitializer, %if.then ] store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef ret void } @@ -266,7 +266,7 @@ if.else: ; preds = %entry br label %if.end if.end: ; preds = %if.else, %if.then - %call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ] + %call6.sink = phi <3 x half> [ %call6, %if.else ], [ zeroinitializer, %if.then ] store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll index 8af7575f03d06d..0b629efffbb30e 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll @@ -4,16 +4,8 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 { ; GCN-LABEL: _amdgpu_ps_main: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_mov_b32 s2, s0 -; GCN-NEXT: s_mov_b32 s3, s0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s0 -; GCN-NEXT: s_mov_b32 s6, s0 -; GCN-NEXT: s_mov_b32 s7, s0 ; GCN-NEXT: 
image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D +; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_clause 0x1 ; GCN-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index 6456c87a31fbfc..cbfd8ec5cb16e6 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -100,14 +100,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16 ; GFX9-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-NEXT: s_branch .LBB0_4 ; GFX9-NEXT: .LBB0_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB0_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -244,14 +237,7 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(<8 x i16> addrspace(1) * %p0, <8 x i ; GFX9-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-NEXT: s_branch .LBB1_4 ; GFX9-NEXT: .LBB1_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB1_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -386,14 +372,7 @@ define <4 x half> @vec_8xf16_extract_4xf16(<8 x half> addrspace(1) * %p0, <8 x h ; GFX9-NEXT: s_cbranch_execz .LBB2_3 ; GFX9-NEXT: s_branch .LBB2_4 ; GFX9-NEXT: .LBB2_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_mov_b32_e32 v4, s10 -; GFX9-NEXT: v_mov_b32_e32 v5, s11 +; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX9-NEXT: .LBB2_3: ; %T ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -567,22 +546,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16(<16 x i16> addrspace(1) * %p0, <16 x ; GFX9-NEXT: s_cbranch_execz .LBB3_3 ; GFX9-NEXT: s_branch .LBB3_4 ; GFX9-NEXT: .LBB3_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB3_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc @@ -759,22 +723,7 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(<16 x i16> addrspace(1) * %p0, <16 ; GFX9-NEXT: s_cbranch_execz .LBB4_3 ; GFX9-NEXT: s_branch .LBB4_4 ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: 
s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB4_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc @@ -949,22 +898,7 @@ define <4 x half> @vec_16xf16_extract_4xf16(<16 x half> addrspace(1) * %p0, <16 ; GFX9-NEXT: s_cbranch_execz .LBB5_3 ; GFX9-NEXT: s_branch .LBB5_4 ; GFX9-NEXT: .LBB5_2: -; GFX9-NEXT: s_mov_b32 s8, 0 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_mov_b32 s10, s8 -; GFX9-NEXT: s_mov_b32 s11, s8 -; GFX9-NEXT: s_mov_b32 s12, s8 -; GFX9-NEXT: s_mov_b32 s13, s8 -; GFX9-NEXT: s_mov_b32 s14, s8 -; GFX9-NEXT: s_mov_b32 s15, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-NEXT: v_mov_b32_e32 v5, s9 -; GFX9-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s13 -; GFX9-NEXT: v_mov_b32_e32 v10, s14 -; GFX9-NEXT: v_mov_b32_e32 v11, s15 +; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; GFX9-NEXT: .LBB5_3: ; %T ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index cc4ece6c7059f7..f742d2c0bda4d5 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -374,18 +374,10 @@ define <4 x float> @insertelement_to_sgpr() nounwind { ; GCN-LABEL: insertelement_to_sgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s12, 0 -; GCN-NEXT: s_mov_b32 s4, s12 -; GCN-NEXT: s_mov_b32 s5, s12 -; GCN-NEXT: s_mov_b32 s6, s12 -; GCN-NEXT: s_mov_b32 s7, s12 -; GCN-NEXT: s_mov_b32 s8, s12 -; GCN-NEXT: s_mov_b32 s9, s12 -; GCN-NEXT: s_mov_b32 s10, s12 -; GCN-NEXT: s_mov_b32 s11, s12 -; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index 6597d6784e0c23..f02cd3fc5e4e65 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}select_undef_lhs: ; GCN: s_waitcnt @@ -43,3 +43,220 @@ define void @select_undef_n2(float addrspace(1)* %a, i32 %c) { } declare float @llvm.amdgcn.rcp.f32(float) + + +; Make sure the vector undef isn't lowered into 0s. 
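+;
+; Rough sketch of the old behavior, assuming the default-Expand
+; fallthrough in the SIISelLowering.cpp switches patched above: with
+; ISD::UNDEF absent from the case list, UNDEF of these vector types
+; was marked Expand, and LegalizeDAG expands UNDEF to zero, i.e. the
+; s_mov_b32 s<n>, 0 / v_mov_b32_e32 v<n>, 0 chains deleted throughout
+; this patch. With the case added, the value simply stays undef (an
+; implicit-def, as in the extract-subvector-16bit.ll checks above).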
+; GCN-LABEL: {{^}}undef_v6f32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v6f32(<6 x float> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <6 x float>, <6 x float> addrspace(3)* undef + %add = fadd <6 x float> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <6 x float> %add, <6 x float> addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}undef_v6i32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v6i32(<6 x i32> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <6 x i32>, <6 x i32> addrspace(3)* undef + %add = add <6 x i32> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <6 x i32> %add, <6 x i32> addrspace(3)* undef + ret void +} + +; Make sure the vector undef isn't lowered into 0s. +; GCN-LABEL: {{^}}undef_v5f32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v5f32(<5 x float> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <5 x float>, <5 x float> addrspace(3)* undef + %add = fadd <5 x float> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <5 x float> %add, <5 x float> addrspace(3)* undef + ret void +} + +; GCN-LABEL: {{^}}undef_v5i32: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v5i32(<5 x i32> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <5 x i32>, <5 x i32> addrspace(3)* undef + %add = add <5 x i32> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <5 x i32> %add, <5 x i32> addrspace(3)* undef + ret void +} + +; Make sure the vector undef isn't lowered into 0s. +; GCN-LABEL: {{^}}undef_v3f64: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v3f64(<3 x double> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <3 x double>, <3 x double> addrspace(3)* %ptr + %add = fadd <3 x double> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <3 x double> %add, <3 x double> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v3i64: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v3i64(<3 x i64> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <3 x i64>, <3 x i64> addrspace(3)* %ptr + %add = add <3 x i64> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <3 x i64> %add, <3 x i64> addrspace(3)* %ptr + ret void +} + +; Make sure the vector undef isn't lowered into 0s. 
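+;
+; The 16-bit element types below additionally cover the packed case:
+; on gfx900 two f16/i16 lanes share one 32-bit register, so the zero
+; expansion would surface as whole-register v_mov_b32_e32 v<n>, 0
+; moves, which the same GCN-NOT patterns reject.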
+; GCN-LABEL: {{^}}undef_v4f16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v4f16(<4 x half> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <4 x half>, <4 x half> addrspace(3)* %ptr + %add = fadd <4 x half> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <4 x half> %add, <4 x half> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v4i16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v4i16(<4 x i16> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <4 x i16>, <4 x i16> addrspace(3)* %ptr + %add = add <4 x i16> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <4 x i16> %add, <4 x i16> addrspace(3)* %ptr + ret void +} + +; Make sure the vector undef isn't lowered into 0s. +; GCN-LABEL: {{^}}undef_v2f16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v2f16(<2 x half> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <2 x half>, <2 x half> addrspace(3)* %ptr + %add = fadd <2 x half> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <2 x half> %add, <2 x half> addrspace(3)* %ptr + ret void +} + +; GCN-LABEL: {{^}}undef_v2i16: +; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 +; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0 +; GCN: s_cbranch_vccnz +define amdgpu_kernel void @undef_v2i16(<2 x i16> addrspace(3)* %ptr, i1 %cond) { +entry: + br label %loop + +loop: + %phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ] + %load = load volatile <2 x i16>, <2 x i16> addrspace(3)* %ptr + %add = add <2 x i16> %load, %phi + br i1 %cond, label %loop, label %ret + +ret: + store volatile <2 x i16> %add, <2 x i16> addrspace(3)* %ptr + ret void +} + +; We were expanding undef vectors into zero vectors. Optimizations +; would then see we used no elements of the vector, and reform the +; undef vector resulting in a combiner loop. 
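+;
+; Sketch of that loop: legalization expanded the undef vector to
+; zeroes, the combiner saw that no lane of the zero vector was used
+; and folded it back to undef, legalization expanded it to zeroes
+; again, and llc would spin forever.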
+; GCN-LABEL: {{^}}inf_loop_undef_vector: +; GCN: s_waitcnt +; GCN-NEXT: v_mad_u64_u32 +; GCN-NEXT: v_mul_lo_u32 +; GCN-NEXT: v_mul_lo_u32 +; GCN-NEXT: v_add3_u32 +; GCN-NEXT: global_store_dwordx2 +define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) { + %i = insertelement <6 x float> %arg, float %arg1, i64 2 + %i3 = bitcast <6 x float> %i to <3 x i64> + %i4 = extractelement <3 x i64> %i3, i64 0 + %i5 = extractelement <3 x i64> %i3, i64 1 + %i6 = mul i64 %i5, %arg2 + %i7 = add i64 %i6, %i4 + store volatile i64 %i7, i64 addrspace(1)* undef, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index ada6c1da04e2ca..7080c84f7b50a2 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1397,28 +1397,20 @@ bb7: ; preds = %bb4 define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, float %arg3) #0 { ; SI-LABEL: if_after_kill_block: ; SI: ; %bb.0: ; %bb -; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: s_wqm_b64 exec, exec -; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB13_3 ; SI-NEXT: ; %bb.1: ; %bb3 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; SI-NEXT: s_cbranch_scc0 .LBB13_6 ; SI-NEXT: ; %bb.2: ; %bb3 ; SI-NEXT: s_andn2_b64 exec, exec, vcc ; SI-NEXT: .LBB13_3: ; %bb4 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: s_mov_b32 s1, s0 -; SI-NEXT: s_mov_b32 s2, s0 -; SI-NEXT: s_mov_b32 s3, s0 -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s0 -; SI-NEXT: s_mov_b32 s6, s0 -; SI-NEXT: s_mov_b32 s7, s0 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 @@ -1439,28 +1431,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; ; GFX10-WAVE64-LABEL: if_after_kill_block: ; GFX10-WAVE64: ; %bb.0: ; %bb -; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec +; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec ; GFX10-WAVE64-NEXT: s_wqm_b64 exec, exec ; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; GFX10-WAVE64-NEXT: s_mov_b32 s0, 0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], vcc ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc ; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4 -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX10-WAVE64-NEXT: s_mov_b32 s1, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s2, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s3, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s4, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s5, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s6, s0 -; GFX10-WAVE64-NEXT: s_mov_b32 s7, s0 +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-WAVE64-NEXT: image_sample_c v0, v[2:3], s[0:7], 
s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 @@ -1479,28 +1463,20 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; ; GFX10-WAVE32-LABEL: if_after_kill_block: ; GFX10-WAVE32: ; %bb.0: ; %bb -; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo +; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-WAVE32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v1 -; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 -; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo +; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, vcc_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo ; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4 -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX10-WAVE32-NEXT: s_mov_b32 s1, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s2, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s3, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s4, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s5, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s6, s0 -; GFX10-WAVE32-NEXT: s_mov_b32 s7, s0 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-WAVE32-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 @@ -1519,29 +1495,22 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; ; GFX11-LABEL: if_after_kill_block: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_mov_b64 s[2:3], exec +; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_wqm_b64 exec, exec -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b64 s[4:5], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1 -; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX11-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11-NEXT: s_cbranch_execz .LBB13_3 ; GFX11-NEXT: ; %bb.1: ; %bb3 ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc +; GFX11-NEXT: s_and_not1_b64 s[0:1], s[0:1], vcc ; GFX11-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX11-NEXT: ; %bb.2: ; %bb3 ; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc ; GFX11-NEXT: .LBB13_3: ; %bb4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_mov_b32 s2, s0 -; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s0 -; GFX11-NEXT: s_mov_b32 s6, s0 -; GFX11-NEXT: s_mov_b32 s7, s0 +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX11-NEXT: image_sample_c v0, v[2:3], s[0:7], s[0:3] dmask:0x10 dim:SQ_RSRC_IMG_1D ; GFX11-NEXT: s_mov_b64 s[0:1], exec ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1584,19 +1553,11 @@ bb9: ; preds = %bb4 define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; SI-LABEL: cbranch_kill: ; SI: ; %bb.0: ; %.entry -; SI-NEXT: s_mov_b32 s4, 0 ; SI-NEXT: s_mov_b64 s[0:1], exec ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: s_mov_b32 s5, s4 -; SI-NEXT: s_mov_b32 s6, s4 -; SI-NEXT: s_mov_b32 
s7, s4 -; SI-NEXT: s_mov_b32 s8, s4 -; SI-NEXT: s_mov_b32 s9, s4 -; SI-NEXT: s_mov_b32 s10, s4 -; SI-NEXT: s_mov_b32 s11, s4 -; SI-NEXT: image_sample_l v1, v[1:4], s[4:11], s[0:3] dmask:0x1 da +; SI-NEXT: image_sample_l v1, v[1:4], s[0:7], s[0:3] dmask:0x1 da ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc @@ -1627,16 +1588,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE64-LABEL: cbranch_kill: ; GFX10-WAVE64: ; %bb.0: ; %.entry ; GFX10-WAVE64-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE64-NEXT: s_mov_b32 s4, 0 ; GFX10-WAVE64-NEXT: s_mov_b64 s[0:1], exec -; GFX10-WAVE64-NEXT: s_mov_b32 s5, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s6, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s7, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s8, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s9, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s10, s4 -; GFX10-WAVE64-NEXT: s_mov_b32 s11, s4 -; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-WAVE64-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE64-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE64-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc @@ -1667,16 +1620,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE32-LABEL: cbranch_kill: ; GFX10-WAVE32: ; %bb.0: ; %.entry ; GFX10-WAVE32-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-WAVE32-NEXT: s_mov_b32 s4, 0 ; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo -; GFX10-WAVE32-NEXT: s_mov_b32 s5, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s6, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s7, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s8, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s9, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s10, s4 -; GFX10-WAVE32-NEXT: s_mov_b32 s11, s4 -; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX10-WAVE32-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-WAVE32-NEXT: s_waitcnt vmcnt(0) ; GFX10-WAVE32-NEXT: v_cmp_ge_f32_e32 vcc_lo, 0, v1 ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo @@ -1707,16 +1652,8 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX11-LABEL: cbranch_kill: ; GFX11: ; %bb.0: ; %.entry ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], exec -; GFX11-NEXT: s_mov_b32 s5, s4 -; GFX11-NEXT: s_mov_b32 s6, s4 -; GFX11-NEXT: s_mov_b32 s7, s4 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s4 -; GFX11-NEXT: s_mov_b32 s10, s4 -; GFX11-NEXT: s_mov_b32 s11, s4 -; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX11-NEXT: image_sample_l v1, [v1, v1, v1, v2], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cmpx_ge_f32_e32 0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/v1024.ll b/llvm/test/CodeGen/AMDGPU/v1024.ll index a5e0454a36344e..1326ba437f94ff 100644 --- a/llvm/test/CodeGen/AMDGPU/v1024.ll +++ b/llvm/test/CodeGen/AMDGPU/v1024.ll @@ -10,6 +10,7 @@ define amdgpu_kernel void @test_v1024() { entry: %alloca = alloca <32 x i32>, align 16, addrspace(5) %cast = bitcast <32 x i32> addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i32(i8 addrspace(5)* %cast, i8 0, i32 128, i1 false) br i1 undef, label %if.then.i.i, label 
%if.else.i
 
 if.then.i.i: ; preds = %entry
@@ -24,6 +25,7 @@ if.then.i62.i: ; preds = %if.else.i, %if.then
   ret void
 }
 
+declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg)
 declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
 declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i1 immarg)
 
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 5164b072a6ddc7..ed0de729dafdfa 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -14,7 +14,6 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_mov_b32 s4, 0
 ; GFX9-NEXT: v_writelane_b32 v40, s33, 2
 ; GFX9-NEXT: s_mov_b32 s33, s32
 ; GFX9-NEXT: v_mov_b32_e32 v36, v16
 ; GFX9-NEXT: v_mov_b32_e32 v35, v15
 ; GFX9-NEXT: v_mov_b32_e32 v34, v14
 ; GFX9-NEXT: v_mov_b32_e32 v33, v13
 ; GFX9-NEXT: v_mov_b32_e32 v32, v12
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_mov_b32 s6, s4
-; GFX9-NEXT: s_mov_b32 s7, s4
-; GFX9-NEXT: s_mov_b32 s8, s4
-; GFX9-NEXT: s_mov_b32 s9, s4
-; GFX9-NEXT: s_mov_b32 s10, s4
-; GFX9-NEXT: s_mov_b32 s11, s4
 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -82,16 +74,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: v_mov_b32_e32 v34, v14
 ; GFX10-NEXT: v_mov_b32_e32 v33, v13
 ; GFX10-NEXT: v_mov_b32_e32 v32, v12
-; GFX10-NEXT: s_mov_b32 s4, 0
 ; GFX10-NEXT: v_writelane_b32 v40, s33, 2
 ; GFX10-NEXT: s_mov_b32 s33, s32
-; GFX10-NEXT: s_mov_b32 s5, s4
-; GFX10-NEXT: s_mov_b32 s6, s4
-; GFX10-NEXT: s_mov_b32 s7, s4
-; GFX10-NEXT: s_mov_b32 s8, s4
-; GFX10-NEXT: s_mov_b32 s9, s4
-; GFX10-NEXT: s_mov_b32 s10, s4
-; GFX10-NEXT: s_mov_b32 s11, s4
 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
 ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -145,16 +129,8 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
 ; GFX11-NEXT: v_mov_b32_e32 v32, v12
-; GFX11-NEXT: s_mov_b32 s0, 0
 ; GFX11-NEXT: v_writelane_b32 v40, s33, 2
 ; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_mov_b32 s1, s0
-; GFX11-NEXT: s_mov_b32 s2, s0
-; GFX11-NEXT: s_mov_b32 s3, s0
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s0
-; GFX11-NEXT: s_mov_b32 s6, s0
-; GFX11-NEXT: s_mov_b32 s7, s0
 ; GFX11-NEXT: s_clause 0x3
 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12
 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8
@@ -225,65 +201,41 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v40, s33, 10 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_writelane_b32 v40, s36, 2 -; GFX9-NEXT: v_writelane_b32 v40, s37, 3 -; GFX9-NEXT: v_writelane_b32 v40, s38, 4 -; GFX9-NEXT: v_writelane_b32 v40, s39, 5 -; GFX9-NEXT: v_writelane_b32 v40, s40, 6 -; GFX9-NEXT: v_writelane_b32 v40, s41, 7 +; GFX9-NEXT: v_writelane_b32 v40, s33, 2 ; GFX9-NEXT: s_mov_b32 s33, s32 -; GFX9-NEXT: v_writelane_b32 v40, s42, 8 -; GFX9-NEXT: s_mov_b32 s36, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s43, 9 ; GFX9-NEXT: v_mov_b32_e32 v45, v16 ; GFX9-NEXT: v_mov_b32_e32 v44, v15 ; GFX9-NEXT: v_mov_b32_e32 v43, v14 ; GFX9-NEXT: v_mov_b32_e32 v42, v13 ; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: s_mov_b32 s37, s36 -; GFX9-NEXT: s_mov_b32 s38, s36 -; GFX9-NEXT: s_mov_b32 s39, s36 -; GFX9-NEXT: s_mov_b32 s40, s36 -; GFX9-NEXT: s_mov_b32 s41, s36 -; GFX9-NEXT: s_mov_b32 s42, s36 -; GFX9-NEXT: s_mov_b32 s43, s36 -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s43, v40, 9 -; GFX9-NEXT: v_readlane_b32 s42, v40, 8 -; GFX9-NEXT: v_readlane_b32 s41, v40, 7 -; GFX9-NEXT: v_readlane_b32 s40, v40, 6 -; GFX9-NEXT: v_readlane_b32 s39, v40, 5 -; GFX9-NEXT: v_readlane_b32 s38, v40, 4 -; GFX9-NEXT: v_readlane_b32 s37, v40, 3 -; GFX9-NEXT: v_readlane_b32 s36, v40, 2 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: v_readlane_b32 s33, v40, 10 +; GFX9-NEXT: v_readlane_b32 s33, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] @@ -298,66 +250,42 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_store_dword v40, off, 
s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: v_writelane_b32 v40, s33, 10 +; GFX10-NEXT: v_writelane_b32 v40, s33, 2 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v41, v16 ; GFX10-NEXT: v_mov_b32_e32 v42, v15 ; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_mov_b32_e32 v44, v13 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v44, v13 ; GFX10-NEXT: v_mov_b32_e32 v45, v12 -; GFX10-NEXT: v_writelane_b32 v40, s36, 2 -; GFX10-NEXT: s_mov_b32 s36, 0 -; GFX10-NEXT: v_writelane_b32 v40, s37, 3 -; GFX10-NEXT: s_mov_b32 s37, s36 -; GFX10-NEXT: v_writelane_b32 v40, s38, 4 -; GFX10-NEXT: s_mov_b32 s38, s36 -; GFX10-NEXT: v_writelane_b32 v40, s39, 5 -; GFX10-NEXT: s_mov_b32 s39, s36 -; GFX10-NEXT: v_writelane_b32 v40, s40, 6 -; GFX10-NEXT: s_mov_b32 s40, s36 -; GFX10-NEXT: v_writelane_b32 v40, s41, 7 -; GFX10-NEXT: s_mov_b32 s41, s36 -; GFX10-NEXT: v_writelane_b32 v40, s42, 8 -; GFX10-NEXT: s_mov_b32 s42, s36 -; GFX10-NEXT: v_writelane_b32 v40, s43, 9 -; GFX10-NEXT: s_mov_b32 s43, s36 -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_getpc_b64 s[4:5] -; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 -; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_clause 0x4 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 -; GFX10-NEXT: v_readlane_b32 s43, v40, 9 -; GFX10-NEXT: v_readlane_b32 s42, v40, 8 -; GFX10-NEXT: v_readlane_b32 s41, v40, 7 -; GFX10-NEXT: v_readlane_b32 s40, v40, 6 -; GFX10-NEXT: v_readlane_b32 s39, v40, 5 -; GFX10-NEXT: v_readlane_b32 s38, v40, 4 -; GFX10-NEXT: v_readlane_b32 s37, v40, 3 -; GFX10-NEXT: v_readlane_b32 s36, v40, 2 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: v_readlane_b32 s33, v40, 10 +; 
GFX10-NEXT: v_readlane_b32 s33, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 @@ -372,7 +300,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v40, s33, 10 +; GFX11-NEXT: v_writelane_b32 v40, s33, 2 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 @@ -380,56 +308,32 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 ; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v45, s33 +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 ; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_mov_b32_e32 v45, v12 -; GFX11-NEXT: v_writelane_b32 v40, s36, 2 -; GFX11-NEXT: s_mov_b32 s36, 0 -; GFX11-NEXT: v_writelane_b32 v40, s37, 3 -; GFX11-NEXT: s_mov_b32 s37, s36 -; GFX11-NEXT: v_writelane_b32 v40, s38, 4 -; GFX11-NEXT: s_mov_b32 s38, s36 -; GFX11-NEXT: v_writelane_b32 v40, s39, 5 -; GFX11-NEXT: s_mov_b32 s39, s36 -; GFX11-NEXT: v_writelane_b32 v40, s40, 6 -; GFX11-NEXT: s_mov_b32 s40, s36 -; GFX11-NEXT: v_writelane_b32 v40, s41, 7 -; GFX11-NEXT: s_mov_b32 s41, s36 -; GFX11-NEXT: v_writelane_b32 v40, s42, 8 -; GFX11-NEXT: s_mov_b32 s42, s36 -; GFX11-NEXT: v_writelane_b32 v40, s43, 9 -; GFX11-NEXT: s_mov_b32 s43, s36 -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_load_b32 v45, off, s33 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4 ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 -; GFX11-NEXT: v_readlane_b32 s43, v40, 9 -; GFX11-NEXT: v_readlane_b32 s42, v40, 8 -; GFX11-NEXT: v_readlane_b32 s41, v40, 7 -; GFX11-NEXT: v_readlane_b32 s40, v40, 6 -; GFX11-NEXT: v_readlane_b32 s39, v40, 5 -; GFX11-NEXT: v_readlane_b32 s38, v40, 4 -; GFX11-NEXT: v_readlane_b32 s37, v40, 3 -; GFX11-NEXT: v_readlane_b32 s36, v40, 2 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: 
s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: v_readlane_b32 s33, v40, 10 +; GFX11-NEXT: v_readlane_b32 s33, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index dc85462631d4a7..16c30174657a5f 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -1833,87 +1833,54 @@ main_body: define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX9-W64-LABEL: test_loop_vcc: ; GFX9-W64: ; %bb.0: ; %entry -; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2 ; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-W64-NEXT: s_mov_b32 s10, 0x40e00000 +; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000 ; GFX9-W64-NEXT: s_branch .LBB31_2 ; GFX9-W64-NEXT: .LBB31_1: ; %body ; GFX9-W64-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 ; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8 -; GFX9-W64-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-W64-NEXT: s_cbranch_execz .LBB31_4 ; GFX9-W64-NEXT: .LBB31_2: ; %loop ; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s10, v8 +; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8 ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6 ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7 ; GFX9-W64-NEXT: s_cbranch_vccz .LBB31_1 ; GFX9-W64-NEXT: ; %bb.3: -; GFX9-W64-NEXT: s_mov_b64 s[2:3], -1 ; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7 ; GFX9-W64-NEXT: ; implicit-def: $vgpr8 ; GFX9-W64-NEXT: .LBB31_4: ; %break -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: ; return to shader part epilog ; ; GFX10-W32-LABEL: test_loop_vcc: ; GFX10-W32: ; %bb.0: ; %entry -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; 
GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_branch .LBB31_2 ; GFX10-W32-NEXT: .p2align 6 ; GFX10-W32-NEXT: .LBB31_1: ; %body ; GFX10-W32-NEXT: ; in Loop: Header=BB31_2 Depth=1 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 -; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_mov_b32 s1, 0 +; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8 ; GFX10-W32-NEXT: s_cbranch_execz .LBB31_4 ; GFX10-W32-NEXT: .LBB31_2: ; %loop ; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1925,11 +1892,10 @@ define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind { ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-W32-NEXT: s_cbranch_vccz .LBB31_1 ; GFX10-W32-NEXT: ; %bb.3: -; GFX10-W32-NEXT: s_mov_b32 s1, -1 ; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-W32-NEXT: ; implicit-def: $vgpr8 ; GFX10-W32-NEXT: .LBB31_4: ; %break -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5 @@ -1999,14 +1965,6 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX9-W64-NEXT: v_lshl_add_u32 v0, v2, 2, v0 ; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2035,14 +1993,6 @@ define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind { ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: buffer_load_dword v0, v2, s[8:11], 0 offen ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2079,18 +2029,10 @@ entry: define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; GFX9-W64-LABEL: test_nonvoid_return: ; GFX9-W64: ; %bb.0: -; GFX9-W64-NEXT: s_mov_b32 s0, 0 -; GFX9-W64-NEXT: s_mov_b64 s[8:9], exec -; GFX9-W64-NEXT: s_mov_b32 s1, s0 -; GFX9-W64-NEXT: s_mov_b32 s2, s0 -; GFX9-W64-NEXT: s_mov_b32 s3, s0 -; GFX9-W64-NEXT: s_mov_b32 s4, s0 -; GFX9-W64-NEXT: s_mov_b32 s5, s0 -; GFX9-W64-NEXT: s_mov_b32 s6, s0 -; GFX9-W64-NEXT: s_mov_b32 s7, s0 +; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec ; GFX9-W64-NEXT: s_wqm_b64 exec, exec ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[8:9] +; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1] ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], 
s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) @@ -2098,18 +2040,10 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { ; ; GFX10-W32-LABEL: test_nonvoid_return: ; GFX10-W32: ; %bb.0: -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 +; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) @@ -2128,20 +2062,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind { define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind { ; GFX9-W64-LABEL: test_nonvoid_return_unreachable: ; GFX9-W64: ; %bb.0: ; %entry -; GFX9-W64-NEXT: s_mov_b32 s4, 0 -; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 ; GFX9-W64-NEXT: s_wqm_b64 exec, exec -; GFX9-W64-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 -; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 +; GFX9-W64-NEXT: s_and_b64 exec, exec, exec ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) -; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB34_2 ; GFX9-W64-NEXT: ; %bb.1: ; %else @@ -2155,20 +2080,11 @@ define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) noun ; ; GFX10-W32-LABEL: test_nonvoid_return_unreachable: ; GFX10-W32: ; %bb.0: ; %entry -; GFX10-W32-NEXT: s_mov_b32 s4, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo -; GFX10-W32-NEXT: s_mov_b32 s5, s4 -; GFX10-W32-NEXT: s_mov_b32 s6, s4 -; GFX10-W32-NEXT: s_mov_b32 s7, s4 -; GFX10-W32-NEXT: s_mov_b32 s8, s4 -; GFX10-W32-NEXT: s_mov_b32 s9, s4 -; GFX10-W32-NEXT: s_mov_b32 s10, s4 -; GFX10-W32-NEXT: s_mov_b32 s11, s4 ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10-W32-NEXT: image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) -; GFX10-W32-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB34_2 ; GFX10-W32-NEXT: ; %bb.1: ; %else @@ -2215,33 +2131,17 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { ; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1 ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX9-W64-NEXT: ; %bb.1: ; %else -; GFX9-W64-NEXT: s_mov_b32 s4, 0 ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-W64-NEXT: 
v_mov_b32_e32 v1, 1 -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 -; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: s_cbranch_execz .LBB35_3 ; GFX9-W64-NEXT: s_branch .LBB35_4 ; GFX9-W64-NEXT: .LBB35_2: ; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX9-W64-NEXT: .LBB35_3: ; %if -; GFX9-W64-NEXT: s_mov_b32 s4, 0 -; GFX9-W64-NEXT: s_mov_b32 s5, s4 -; GFX9-W64-NEXT: s_mov_b32 s6, s4 -; GFX9-W64-NEXT: s_mov_b32 s7, s4 -; GFX9-W64-NEXT: s_mov_b32 s8, s4 -; GFX9-W64-NEXT: s_mov_b32 s9, s4 -; GFX9-W64-NEXT: s_mov_b32 s10, s4 -; GFX9-W64-NEXT: s_mov_b32 s11, s4 ; GFX9-W64-NEXT: s_waitcnt vmcnt(0) ; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-W64-NEXT: image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf +; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf ; GFX9-W64-NEXT: .LBB35_4: ; %end ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3] ; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0 @@ -2252,21 +2152,13 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { ; GFX10-W32-LABEL: test_scc: ; GFX10-W32: ; %bb.0: ; %main_body ; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-W32-NEXT: s_mov_b32 s8, exec_lo +; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1 ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB35_2 ; GFX10-W32-NEXT: ; %bb.1: ; %else ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-W32-NEXT: s_cbranch_execz .LBB35_3 ; GFX10-W32-NEXT: s_branch .LBB35_4 @@ -2275,17 +2167,9 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 { ; GFX10-W32-NEXT: .LBB35_3: ; %if ; GFX10-W32-NEXT: s_waitcnt vmcnt(0) ; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-W32-NEXT: s_mov_b32 s0, 0 -; GFX10-W32-NEXT: s_mov_b32 s1, s0 -; GFX10-W32-NEXT: s_mov_b32 s2, s0 -; GFX10-W32-NEXT: s_mov_b32 s3, s0 -; GFX10-W32-NEXT: s_mov_b32 s4, s0 -; GFX10-W32-NEXT: s_mov_b32 s5, s0 -; GFX10-W32-NEXT: s_mov_b32 s6, s0 -; GFX10-W32-NEXT: s_mov_b32 s7, s0 ; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-W32-NEXT: .LBB35_4: ; %end -; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s8 +; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1 ; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0 ; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)