From fac33a363a7c151ca3f1e04f6ee54fc3f83a5181 Mon Sep 17 00:00:00 2001 From: Kateryna Muts Date: Wed, 20 Nov 2024 08:47:55 +0000 Subject: [PATCH] [AIE2] Fix memory access cycle for TM instructions --- llvm/lib/Target/AIE/AIE2Schedule.td | 4 +- llvm/test/CodeGen/AIE/aie2/brcc.ll | 12 +++--- .../CodeGen/AIE/aie2/end-to-end/Add2D-red.ll | 43 ++++++++++--------- .../AIE/aie2/end-to-end/TanhTemplated-swp.ll | 9 ++-- .../AIE/aie2/hardware-loops/loop-with-call.ll | 8 ++-- .../AIE/aie2/schedule/interblock/memdeps.mir | 3 ++ .../AIE/aie2/schedule/resource/store.mir | 6 +-- llvm/test/CodeGen/AIE/aie2/switch.ll | 4 +- 8 files changed, 47 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/AIE/AIE2Schedule.td b/llvm/lib/Target/AIE/AIE2Schedule.td index a5ad1023e4eb..fbfb52c5f190 100644 --- a/llvm/lib/Target/AIE/AIE2Schedule.td +++ b/llvm/lib/Target/AIE/AIE2Schedule.td @@ -594,7 +594,7 @@ MemInstrItinData, SimpleCycle, EmptyCycles<4>, SimpleCycle], [7,1,1], - MemoryCycles<[5]>>, + MemoryCycles<[4]>>, InstrItinData], [1,1,1,1]>, InstrItinData], [1,1,1,1]>, InstrItinData], [1,1,1,1]>, @@ -703,7 +703,7 @@ MemInstrItinData, PrefixCycle, SimpleCycle, SimpleCycle], [1,1], - MemoryCycles<[5]>>, + MemoryCycles<[4]>>, MemInstrItinData, SimpleCycle], diff --git a/llvm/test/CodeGen/AIE/aie2/brcc.ll b/llvm/test/CodeGen/AIE/aie2/brcc.ll index dc6b837e2e09..741cced6acfa 100644 --- a/llvm/test/CodeGen/AIE/aie2/brcc.ll +++ b/llvm/test/CodeGen/AIE/aie2/brcc.ll @@ -121,17 +121,17 @@ define i32 @br_diamond_complex_end(i32 %a, i32 %b, i32 %v, i32* nocapture writ ; CHECK-NEXT: nopa ; nopb ; geu r0, r2, r1 ; CHECK-NEXT: jnz r0, #.LBB3_2 ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: paddb [sp], #32 // Delay Slot 3 -; CHECK-NEXT: st r16, [sp, #-32] // 4-byte Folded Spill Delay Slot 2 -; CHECK-NEXT: st lr, [sp, #-28]; or r16, r3, r3 // 4-byte Folded Spill Delay Slot 1 +; CHECK-NEXT: paddb [sp], #32 // Delay Slot 4 +; CHECK-NEXT: st r16, [sp, #-32] // 4-byte Folded Spill Delay Slot 3 +; CHECK-NEXT: st lr, [sp, #-28] // 4-byte Folded Spill Delay Slot 2 +; CHECK-NEXT: mov r16, r3 // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.then ; CHECK-NEXT: nopb ; nopa ; nops ; j #.LBB3_3; nopv ; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 1 +; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB3_2: // %if.else ; CHECK-NEXT: nopb ; nopa ; nops ; jl #foo; nopv diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll index 17dfb2a60671..3ec553074ef4 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll @@ -79,16 +79,16 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: lda p7, [p4, #0]; paddb [p5], #-88; mov p4, sp ; ASM-NEXT: lda r12, [p5, #0]; paddb [p4], #-68; mov p5, sp ; ASM-NEXT: lda p4, [p4, #0]; paddb [p5], #-92 -; ASM-NEXT: lda r13, [p5, #0] -; ASM-NEXT: mova r6, #1; add r7, r1, #-1; mov p5, r6 +; ASM-NEXT: lda r13, [p5, #0]; add r7, r1, #-1 +; ASM-NEXT: mova r6, #1; nez r0, r0; mov p5, r6 ; ASM-NEXT: mova r6, #3; ne r3, r3, r6 ; ASM-NEXT: ltu r7, r7, r6 -; ASM-NEXT: jz r7, #.LBB0_2 -; ASM-NEXT: st dn4, [p5, #0]; nez r0, r0 // Delay Slot 5 -; ASM-NEXT: st r0, [p6, #0] // Delay Slot 4 -; ASM-NEXT: paddb [p2], m3; st r5, [p7, #0] // Delay Slot 3 -; ASM-NEXT: padda [p1], m2; paddb [p2], m5; and r8, r1, r6; st r3, [p4, #0] // Delay Slot 2 -; ASM-NEXT: mova r6, #0; paddb [p2], m4; st r8, [p0, #0] // Delay Slot 1 +; ASM-NEXT: st dn4, [p5, #0]; jz r7, #.LBB0_2 +; ASM-NEXT: st r0, [p6, #0] // Delay Slot 5 +; ASM-NEXT: st r5, [p7, #0] // Delay Slot 4 +; ASM-NEXT: st r3, [p4, #0]; paddb [p2], m3; and r8, r1, r6 // Delay Slot 3 +; ASM-NEXT: paddb [p2], m5; st r8, [p0, #0] // Delay Slot 2 +; ASM-NEXT: mova r6, #0; paddb [p2], m4; padds [p1], m2 // Delay Slot 1 ; ASM-NEXT: // %bb.1: ; ASM-NEXT: nopb ; nopa ; nops ; j #.LBB0_6; nopv ; ASM-NEXT: nopa ; nopx // Delay Slot 5 @@ -98,9 +98,9 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: mova r0, #0 // Delay Slot 1 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_2: // %entry.new -; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc0, #0 +; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm1, s1, [p1], m1; nops ; nopx ; mov dc0, #0; nopv ; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; mov dc4, dc0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0; nopx ; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r4 ; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; mov s1, r2 @@ -110,16 +110,16 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0 ; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0 ; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1 -; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; vadd cm4, cm4, cm1, r0 -; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; movx r6, #-4; vadd cm6, cm6, cm2, r0 -; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; and r1, r1, r6; vadd cm1, cm7, cm5, r0 -; ASM-NEXT: add r1, r1, #-4; mov crSRSSign, r3; vadd cm8, cm3, cm0, r0 +; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vadd cm4, cm4, cm1, r0 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; vadd cm6, cm6, cm2, r0 +; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; movx r6, #-4; vadd cm1, cm7, cm5, r0 +; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; and r1, r1, r6; vadd cm8, cm3, cm0, r0 +; ASM-NEXT: add r1, r1, #-4; mov crSRSSign, r3 ; ASM-NEXT: add r1, r1, #-4; mov s0, r5 -; ASM-NEXT: jz r1, #.LBB0_5 -; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32 // Delay Slot 5 -; ASM-NEXT: vst.srs.d8.s32 cm6, s0, [p3], #32 // Delay Slot 4 -; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32 // Delay Slot 3 +; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32; jz r1, #.LBB0_5 +; ASM-NEXT: vst.srs.d8.s32 cm6, s0, [p3], #32 // Delay Slot 5 +; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32 // Delay Slot 4 +; ASM-NEXT: nop // Delay Slot 3 ; ASM-NEXT: nop // Delay Slot 2 ; ASM-NEXT: nop // Delay Slot 1 ; ASM-NEXT: .p2align 4 @@ -150,8 +150,9 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm ; ASM-NEXT: vst.srs.d8.s32 cm8, s0, [p3], #32 ; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32 ; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32 -; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; mov crUPSSign, #0 -; ASM-NEXT: vst.srs.d8.s32 cm0, s0, [p3], #32; mov r6, dc0 +; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32 +; ASM-NEXT: vst.srs.d8.s32 cm0, s0, [p3], #32; mov crUPSSign, #0 +; ASM-NEXT: mov r6, dc0 ; ASM-NEXT: mov r0, dc4 ; ASM-NEXT: mov crSRSSign, #0 ; ASM-NEXT: .p2align 4 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll index 63dc588fa834..7706b5ed46be 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll @@ -132,7 +132,7 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: .L_LEnd0: ; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv ; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh7, wl2; nopv +; CHECK-NEXT: nopa ; nopb ; nopx ; vmov wh7, wl2; nops ; CHECK-NEXT: vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2 ; CHECK-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1 ; CHECK-NEXT: vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1 @@ -164,10 +164,11 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: vsub.f bmh0, bmh0, bmh1, r0 ; CHECK-NEXT: nop ; CHECK-NEXT: nop +; CHECK-NEXT: nop ; CHECK-NEXT: ret lr -; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 4 -; CHECK-NEXT: vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 3 +; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 5 +; CHECK-NEXT: vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 ; CHECK-NEXT: mov r16, r8 // Delay Slot 1 for.body.lr.ph: diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/loop-with-call.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/loop-with-call.ll index 93763a50c8d6..aa1bfed6b784 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/loop-with-call.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/loop-with-call.ll @@ -71,10 +71,10 @@ define dso_local void @_Z5test4i(i32 noundef %n) { ; CHECK-NEXT: ge r0, r1, r0; mov r16, r0 ; CHECK-NEXT: jnz r0, #.LBB1_3 ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: nop // Delay Slot 4 -; CHECK-NEXT: st lr, [sp, #-20] // 4-byte Folded Spill Delay Slot 3 -; CHECK-NEXT: st p6, [sp, #-28] // 4-byte Folded Spill Delay Slot 2 -; CHECK-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill Delay Slot 1 +; CHECK-NEXT: st lr, [sp, #-20] // 4-byte Folded Spill Delay Slot 4 +; CHECK-NEXT: st p6, [sp, #-28] // 4-byte Folded Spill Delay Slot 3 +; CHECK-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: nopa ; movxm p6, #.L.str ; CHECK-NEXT: movxm p7, #.L.str.1 diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/interblock/memdeps.mir b/llvm/test/CodeGen/AIE/aie2/schedule/interblock/memdeps.mir index b559107ff40e..ce8089b31921 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/interblock/memdeps.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/interblock/memdeps.mir @@ -34,6 +34,7 @@ body: | ; CHECK-NEXT: VST_SRS_D8_S32_ag_idx_imm killed $p0, 0, killed $cm0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -95,6 +96,7 @@ body: | ; CHECK-NEXT: liveins: $p0, $cm0, $s0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: VST_SRS_D8_S32_ag_idx_imm killed $p0, 0, killed $cm0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign + ; CHECK-NEXT: NOP ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -153,6 +155,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: VST_SRS_D8_S32_ag_idx_imm killed $p0, 0, killed $cm0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) diff --git a/llvm/test/CodeGen/AIE/aie2/schedule/resource/store.mir b/llvm/test/CodeGen/AIE/aie2/schedule/resource/store.mir index 1b3393548c8d..f8fb796bff2f 100644 --- a/llvm/test/CodeGen/AIE/aie2/schedule/resource/store.mir +++ b/llvm/test/CodeGen/AIE/aie2/schedule/resource/store.mir @@ -154,12 +154,12 @@ alignment: 16 body: | bb.0.entry: ; CHECK-LABEL: name: VST_CONV_2D_BF16_FP32_STORE_UNIT - ; CHECK: $p5, $dc2 = VST_2D_SRS_D16_S32 killed $p5, killed $d2, killed $bml0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<8 x s32>) into stack - 32) - ; CHECK-NEXT: $p2, $dc0 = VST_CONV_2D_BF16_FP32 killed $p2, killed $d0, killed $bml1, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 64) - ; CHECK-NEXT: NOP + ; CHECK: $p2, $dc0 = VST_CONV_2D_BF16_FP32 killed $p2, killed $d0, killed $bml1, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 64) + ; CHECK-NEXT: $p5, $dc2 = VST_2D_SRS_D16_S32 killed $p5, killed $d2, killed $bml0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<8 x s32>) into stack - 32) ; CHECK-NEXT: $p2, $dc1 = VST_CONV_2D_BF16_FP32 killed $p2, killed $d1, killed $bml2, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 96) ; CHECK-NEXT: NOP ; CHECK-NEXT: NOP + ; CHECK-NEXT: NOP $p5, $dc2 = VST_2D_SRS_D16_S32 $p5, $d2, $bml0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<8 x s32>) into stack - 32) $p2, $dc0 = VST_CONV_2D_BF16_FP32 $p2, $d0, $bml1, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 64) $p2, $dc1 = VST_CONV_2D_BF16_FP32 $p2, $d1, $bml2, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 96) diff --git a/llvm/test/CodeGen/AIE/aie2/switch.ll b/llvm/test/CodeGen/AIE/aie2/switch.ll index 3d52a08afda9..7911a317a32f 100644 --- a/llvm/test/CodeGen/AIE/aie2/switch.ll +++ b/llvm/test/CodeGen/AIE/aie2/switch.ll @@ -37,8 +37,8 @@ define i32 @test(i8 signext %i) noinline nounwind optnone { ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: st lr, [sp, #-28] // 4-byte Folded Spill Delay Slot 1 +; CHECK-NEXT: st lr, [sp, #-28] // 4-byte Folded Spill Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: nopa ; nopb ; movxm p0, #.LJTI0_0 ; CHECK-NEXT: movxm r1, #1048575