Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AIE2] Fix memory access cycle for TM instructions #240

Draft
wants to merge 1 commit into
base: aie-public
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/lib/Target/AIE/AIE2Schedule.td
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ MemInstrItinData<II_LDA_TM,
[SimpleCycle<PROC_BUS>, SimpleCycle<LOAD_UNIT_A>, EmptyCycles<4>,
SimpleCycle<R_WA_PORT>],
[7,1,1],
MemoryCycles<[5]>>,
MemoryCycles<[4]>>,
InstrItinData<II_LSHL, [InstrStage<1, [R_WX_PORT]>], [1,1,1,1]>,
InstrItinData<II_LT, [InstrStage<1, [R_WX_PORT]>], [1,1,1,1]>,
InstrItinData<II_LTU, [InstrStage<1, [R_WX_PORT]>], [1,1,1,1]>,
Expand Down Expand Up @@ -703,7 +703,7 @@ MemInstrItinData<II_ST_TM,
[PrefixCycle<PROC_BUS>, PrefixCycle<R_RV_PORT>,
SimpleCycle<P_RM_PORT>, SimpleCycle<STORE_UNIT>],
[1,1],
MemoryCycles<[5]>>,
MemoryCycles<[4]>>,
MemInstrItinData<II_ST_Q,
[AvoidPartWordStore, EmptyCycles<1>,
SimpleCycle<STORE_UNIT>],
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AIE/aie2/brcc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,17 @@ define i32 @br_diamond_complex_end(i32 %a, i32 %b, i32 %v, i32* nocapture writ
; CHECK-NEXT: nopa ; nopb ; geu r0, r2, r1
; CHECK-NEXT: jnz r0, #.LBB3_2
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: paddb [sp], #32 // Delay Slot 3
; CHECK-NEXT: st r16, [sp, #-32] // 4-byte Folded Spill Delay Slot 2
; CHECK-NEXT: st lr, [sp, #-28]; or r16, r3, r3 // 4-byte Folded Spill Delay Slot 1
; CHECK-NEXT: paddb [sp], #32 // Delay Slot 4
; CHECK-NEXT: st r16, [sp, #-32] // 4-byte Folded Spill Delay Slot 3
; CHECK-NEXT: st lr, [sp, #-28] // 4-byte Folded Spill Delay Slot 2
; CHECK-NEXT: mov r16, r3 // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %if.then
; CHECK-NEXT: nopb ; nopa ; nops ; j #.LBB3_3; nopv
; CHECK-NEXT: nopa ; nopx // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 1
; CHECK-NEXT: st r16, [p0, #0] // Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB3_2: // %if.else
; CHECK-NEXT: nopb ; nopa ; nops ; jl #foo; nopv
Expand Down
43 changes: 22 additions & 21 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/Add2D-red.ll
Original file line number Diff line number Diff line change
Expand Up @@ -79,16 +79,16 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: lda p7, [p4, #0]; paddb [p5], #-88; mov p4, sp
; ASM-NEXT: lda r12, [p5, #0]; paddb [p4], #-68; mov p5, sp
; ASM-NEXT: lda p4, [p4, #0]; paddb [p5], #-92
; ASM-NEXT: lda r13, [p5, #0]
; ASM-NEXT: mova r6, #1; add r7, r1, #-1; mov p5, r6
; ASM-NEXT: lda r13, [p5, #0]; add r7, r1, #-1
; ASM-NEXT: mova r6, #1; nez r0, r0; mov p5, r6
; ASM-NEXT: mova r6, #3; ne r3, r3, r6
; ASM-NEXT: ltu r7, r7, r6
; ASM-NEXT: jz r7, #.LBB0_2
; ASM-NEXT: st dn4, [p5, #0]; nez r0, r0 // Delay Slot 5
; ASM-NEXT: st r0, [p6, #0] // Delay Slot 4
; ASM-NEXT: paddb [p2], m3; st r5, [p7, #0] // Delay Slot 3
; ASM-NEXT: padda [p1], m2; paddb [p2], m5; and r8, r1, r6; st r3, [p4, #0] // Delay Slot 2
; ASM-NEXT: mova r6, #0; paddb [p2], m4; st r8, [p0, #0] // Delay Slot 1
; ASM-NEXT: st dn4, [p5, #0]; jz r7, #.LBB0_2
; ASM-NEXT: st r0, [p6, #0] // Delay Slot 5
; ASM-NEXT: st r5, [p7, #0] // Delay Slot 4
; ASM-NEXT: st r3, [p4, #0]; paddb [p2], m3; and r8, r1, r6 // Delay Slot 3
; ASM-NEXT: paddb [p2], m5; st r8, [p0, #0] // Delay Slot 2
; ASM-NEXT: mova r6, #0; paddb [p2], m4; padds [p1], m2 // Delay Slot 1
; ASM-NEXT: // %bb.1:
; ASM-NEXT: nopb ; nopa ; nops ; j #.LBB0_6; nopv
; ASM-NEXT: nopa ; nopx // Delay Slot 5
Expand All @@ -98,9 +98,9 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: mova r0, #0 // Delay Slot 1
; ASM-NEXT: .p2align 4
; ASM-NEXT: .LBB0_2: // %entry.new
; ASM-NEXT: vlda.ups.s32.d8 cm1, s1, [p1], m1; mov dc0, #0
; ASM-NEXT: nopb ; vlda.ups.s32.d8 cm1, s1, [p1], m1; nops ; nopx ; mov dc0, #0; nopv
; ASM-NEXT: vlda.ups.s32.d8 cm2, s1, [p1], m1; mov dc4, dc0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm4, s1, [p2], d0; nopx
; ASM-NEXT: vlda.ups.s32.d8 cm5, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; mov crUPSSign, r4
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; mov s1, r2
Expand All @@ -110,16 +110,16 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: vlda.3d.ups.s32.d8 cm5, s1, [p2], d0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm7, s1, [p2], d0
; ASM-NEXT: vlda.ups.s32.d8 cm4, s1, [p1], m1
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; vadd cm4, cm4, cm1, r0
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; movx r6, #-4; vadd cm6, cm6, cm2, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; and r1, r1, r6; vadd cm1, cm7, cm5, r0
; ASM-NEXT: add r1, r1, #-4; mov crSRSSign, r3; vadd cm8, cm3, cm0, r0
; ASM-NEXT: vlda.ups.s32.d8 cm3, s1, [p1], m1; vadd cm4, cm4, cm1, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm6, s1, [p2], d0; vadd cm6, cm6, cm2, r0
; ASM-NEXT: vlda.ups.s32.d8 cm0, s1, [p1], m1; movx r6, #-4; vadd cm1, cm7, cm5, r0
; ASM-NEXT: vlda.3d.ups.s32.d8 cm1, s1, [p2], d0; and r1, r1, r6; vadd cm8, cm3, cm0, r0
; ASM-NEXT: add r1, r1, #-4; mov crSRSSign, r3
; ASM-NEXT: add r1, r1, #-4; mov s0, r5
; ASM-NEXT: jz r1, #.LBB0_5
; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm6, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32 // Delay Slot 3
; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32; jz r1, #.LBB0_5
; ASM-NEXT: vst.srs.d8.s32 cm6, s0, [p3], #32 // Delay Slot 5
; ASM-NEXT: vst.srs.d8.s32 cm1, s0, [p3], #32 // Delay Slot 4
; ASM-NEXT: nop // Delay Slot 3
; ASM-NEXT: nop // Delay Slot 2
; ASM-NEXT: nop // Delay Slot 1
; ASM-NEXT: .p2align 4
Expand Down Expand Up @@ -150,8 +150,9 @@ define void @add2d(ptr noalias %params, ptr noalias %ifm1_data, ptr noalias %ifm
; ASM-NEXT: vst.srs.d8.s32 cm8, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm2, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm4, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32; mov crUPSSign, #0
; ASM-NEXT: vst.srs.d8.s32 cm0, s0, [p3], #32; mov r6, dc0
; ASM-NEXT: vst.srs.d8.s32 cm3, s0, [p3], #32
; ASM-NEXT: vst.srs.d8.s32 cm0, s0, [p3], #32; mov crUPSSign, #0
; ASM-NEXT: mov r6, dc0
; ASM-NEXT: mov r0, dc4
; ASM-NEXT: mov crSRSSign, #0
; ASM-NEXT: .p2align 4
Expand Down
9 changes: 5 additions & 4 deletions llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
; CHECK-NEXT: .L_LEnd0:
; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
; CHECK-NEXT: // %bb.2:
; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh7, wl2; nopv
; CHECK-NEXT: nopa ; nopb ; nopx ; vmov wh7, wl2; nops
; CHECK-NEXT: vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2
; CHECK-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
; CHECK-NEXT: vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1
Expand Down Expand Up @@ -164,10 +164,11 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
; CHECK-NEXT: vsub.f bmh0, bmh0, bmh1, r0
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: nop
; CHECK-NEXT: ret lr
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 4
; CHECK-NEXT: vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 3
; CHECK-NEXT: vst.conv.bf16.fp32 bmh2, [p1], #32 // Delay Slot 5
; CHECK-NEXT: vst.conv.bf16.fp32 bmh0, [p1], #32 // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: mov r16, r8 // Delay Slot 1
for.body.lr.ph:
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AIE/aie2/hardware-loops/loop-with-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,10 @@ define dso_local void @_Z5test4i(i32 noundef %n) {
; CHECK-NEXT: ge r0, r1, r0; mov r16, r0
; CHECK-NEXT: jnz r0, #.LBB1_3
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: st lr, [sp, #-20] // 4-byte Folded Spill Delay Slot 3
; CHECK-NEXT: st p6, [sp, #-28] // 4-byte Folded Spill Delay Slot 2
; CHECK-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill Delay Slot 1
; CHECK-NEXT: st lr, [sp, #-20] // 4-byte Folded Spill Delay Slot 4
; CHECK-NEXT: st p6, [sp, #-28] // 4-byte Folded Spill Delay Slot 3
; CHECK-NEXT: st p7, [sp, #-32] // 4-byte Folded Spill Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: nopa ; movxm p6, #.L.str
; CHECK-NEXT: movxm p7, #.L.str.1
Expand Down
3 changes: 3 additions & 0 deletions llvm/test/CodeGen/AIE/aie2/schedule/interblock/memdeps.mir
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ body: |
; CHECK-NEXT: VST_SRS_D8_S32_ag_idx_imm killed $p0, 0, killed $cm0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign
; CHECK-NEXT: NOP
; CHECK-NEXT: NOP
; CHECK-NEXT: NOP
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed offline, that extra NOP might hurt QoR. I'd be happy to measure the impact.

If the results aren't good, we could extend

  int getMinFirstMemoryCycle() const;
  int getMaxFirstMemoryCycle() const;
  int getMinLastMemoryCycle() const;
  int getMaxLastMemoryCycle() const;

into e.g.

  int getMinFirstMemoryCycle(unsigned AddrSpace = 0) const;
  int getMaxFirstMemoryCycle(unsigned AddrSpace = 0) const;
  int getMinLastMemoryCycle(unsigned AddrSpace = 0) const;
  int getMaxLastMemoryCycle(unsigned AddrSpace = 0) const;

Taking the example of AIE2:

  • getMinFirstMemoryCycle(AIE2::AddressSpaces::none) should return 4
  • getMinFirstMemoryCycle(AIE2::AddressSpaces::DM) should return 5
  • getMinFirstMemoryCycle(AIE2::AddressSpaces::TM) should return 4

But then I guess we'll have the same problem, because many loads/stores will have the default `none` address space even though they actually access DM. It would be quite a big change if we also needed to infer the AddressSpaces::DM address space.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you think it would be a big change to infer DM for accesses that carry the default `none` address space?

Note: accesses to tile memory are always annotated at the intrinsic level, so we will always be able to infer the TM address space.

; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand Down Expand Up @@ -95,6 +96,7 @@ body: |
; CHECK-NEXT: liveins: $p0, $cm0, $s0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: VST_SRS_D8_S32_ag_idx_imm killed $p0, 0, killed $cm0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign
; CHECK-NEXT: NOP
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand Down Expand Up @@ -153,6 +155,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: VST_SRS_D8_S32_ag_idx_imm killed $p0, 0, killed $cm0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign
; CHECK-NEXT: NOP
; CHECK-NEXT: NOP
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AIE/aie2/schedule/resource/store.mir
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,12 @@ alignment: 16
body: |
bb.0.entry:
; CHECK-LABEL: name: VST_CONV_2D_BF16_FP32_STORE_UNIT
; CHECK: $p5, $dc2 = VST_2D_SRS_D16_S32 killed $p5, killed $d2, killed $bml0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<8 x s32>) into stack - 32)
; CHECK-NEXT: $p2, $dc0 = VST_CONV_2D_BF16_FP32 killed $p2, killed $d0, killed $bml1, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 64)
; CHECK-NEXT: NOP
; CHECK: $p2, $dc0 = VST_CONV_2D_BF16_FP32 killed $p2, killed $d0, killed $bml1, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 64)
; CHECK-NEXT: $p5, $dc2 = VST_2D_SRS_D16_S32 killed $p5, killed $d2, killed $bml0, killed $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<8 x s32>) into stack - 32)
; CHECK-NEXT: $p2, $dc1 = VST_CONV_2D_BF16_FP32 killed $p2, killed $d1, killed $bml2, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 96)
; CHECK-NEXT: NOP
; CHECK-NEXT: NOP
; CHECK-NEXT: NOP
$p5, $dc2 = VST_2D_SRS_D16_S32 $p5, $d2, $bml0, $s0, implicit-def $srsrs_of, implicit $crsat, implicit $crrnd, implicit $crsrssign :: (store (<8 x s32>) into stack - 32)
$p2, $dc0 = VST_CONV_2D_BF16_FP32 $p2, $d0, $bml1, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 64)
$p2, $dc1 = VST_CONV_2D_BF16_FP32 $p2, $d1, $bml2, implicit-def $srf2fflags, implicit $crrnd, implicit $crf2fmask :: (store (<8 x s32>) into stack - 96)
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AIE/aie2/switch.ll
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ define i32 @test(i8 signext %i) noinline nounwind optnone {
; CHECK-NEXT: nop // Delay Slot 5
; CHECK-NEXT: nop // Delay Slot 4
; CHECK-NEXT: nop // Delay Slot 3
; CHECK-NEXT: nop // Delay Slot 2
; CHECK-NEXT: st lr, [sp, #-28] // 4-byte Folded Spill Delay Slot 1
; CHECK-NEXT: st lr, [sp, #-28] // 4-byte Folded Spill Delay Slot 2
; CHECK-NEXT: nop // Delay Slot 1
; CHECK-NEXT: // %bb.1: // %entry
; CHECK-NEXT: nopa ; nopb ; movxm p0, #.LJTI0_0
; CHECK-NEXT: movxm r1, #1048575
Expand Down
Loading