Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arm64: Generate conditional comparison and selection instructions #55364

Closed
3 tasks
Tracked by #77010
echesakov opened this issue Jul 8, 2021 · 11 comments
Closed
3 tasks
Tracked by #77010

Arm64: Generate conditional comparison and selection instructions #55364

echesakov opened this issue Jul 8, 2021 · 11 comments
Assignees
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI Bottom Up Work Not part of a theme, epic, or user story
Milestone

Comments

@echesakov
Copy link
Contributor

echesakov commented Jul 8, 2021

Arm64 provides branchless conditional selection and comparison instructions that should be utilized by RyuJIT in the code it generates.

image

Reference: https://eclecticlight.co/2021/07/20/code-in-arm-assembly-conditions-without-branches/

RyuJIT already has support for them as seen below:

INST1(csel, "csel", 0, IF_DR_3D, 0x1A800000)
// csel Rd,Rn,Rm,cond DR_3D X0011010100mmmmm cccc00nnnnnddddd 1A80 0000 cond
INST1(csinc, "csinc", 0, IF_DR_3D, 0x1A800400)
// csinc Rd,Rn,Rm,cond DR_3D X0011010100mmmmm cccc01nnnnnddddd 1A80 0400 cond
INST1(csinv, "csinv", 0, IF_DR_3D, 0x5A800000)
// csinv Rd,Rn,Rm,cond DR_3D X1011010100mmmmm cccc00nnnnnddddd 5A80 0000 cond
INST1(csneg, "csneg", 0, IF_DR_3D, 0x5A800400)
// csneg Rd,Rn,Rm,cond DR_3D X1011010100mmmmm cccc01nnnnnddddd 5A80 0400 cond
INST1(cinc, "cinc", 0, IF_DR_2D, 0x1A800400)
// cinc Rd,Rn,cond DR_2D X0011010100nnnnn cccc01nnnnnddddd 1A80 0400 cond
INST1(cinv, "cinv", 0, IF_DR_2D, 0x5A800000)
// cinv Rd,Rn,cond DR_2D X1011010100nnnnn cccc00nnnnnddddd 5A80 0000 cond
INST1(cneg, "cneg", 0, IF_DR_2D, 0x5A800400)
// cneg Rd,Rn,cond DR_2D X1011010100nnnnn cccc01nnnnnddddd 5A80 0400 cond
INST1(cset, "cset", 0, IF_DR_1D, 0x1A9F07E0)
// cset Rd,cond DR_1D X001101010011111 cccc0111111ddddd 1A9F 07E0 Rd cond

INST2(ccmp, "ccmp", CMP, IF_EN2F, 0x7A400000, 0x7A400800)
// ccmp Rn,Rm, nzcv,cond DR_2I X1111010010mmmmm cccc00nnnnn0nzcv 7A40 0000 nzcv, cond
// ccmp Rn,imm5,nzcv,cond DI_1F X1111010010iiiii cccc10nnnnn0nzcv 7A40 0800 imm5, nzcv, cond
INST2(ccmn, "ccmn", CMP, IF_EN2F, 0x3A400000, 0x3A400800)
// ccmn Rn,Rm, nzcv,cond DR_2I X0111010010mmmmm cccc00nnnnn0nzcv 3A40 0000 nzcv, cond
// ccmn Rn,imm5,nzcv,cond DI_1F X0111010010iiiii cccc10nnnnn0nzcv 3A40 0800 imm5, nzcv, cond

Currently, the methods emitIns_R_R_R_COND and emitIns_R_I_FLAGS_COND that produce these instructions are largely unused; the one exception is emitIns_R_R_R_COND, which was recently used in #66407 to generate the csneg instruction. Once these instructions are used, we could produce much better code. Below are some examples:

Example# 1:

    static void Test(uint op1, uint op2) {
        if (op1 > 0 && op2 > 0) {
            op1 = 5;
        }  else {
            op1 = 10;
        }
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/5ov9TKx6P
Current code:

G_M2878_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M2878_IG02:
            cbz     w0, G_M2878_IG04
                                                ;; bbWeight=1    PerfScore 1.00
G_M2878_IG03:
            cbz     w1, G_M2878_IG04
            mov     w0, #5
            b       G_M2878_IG05
                                                ;; bbWeight=0.50 PerfScore 1.25
G_M2878_IG04:
            mov     w0, #10
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M2878_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M2878_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 2:

    static void Test(uint op1, uint op2) {
        op1 = op1 > 0 ? 5 : 6;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GTnc4jjfG
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cmp     w0, #0
            bgt     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            mov     w0, #6
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
            mov     w0, #5
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 3:

    static void Test(uint op1, uint op2) {
        op1 = (op1 > 0) ? 0 : 1;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GoqcsM1Tf
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cmp     w0, #0
            bgt     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            mov     w0, #1
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
            mov     w0, wzr
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 4:

    static void Test(uint op1, uint op2, uint xyz, uint def) {
        op1 = op1 > 0 ? xyz : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/1EfxPn48q
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cbnz    w0, G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG03:
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
            mov     w3, w2
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            mov     w0, w3
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 5:

    static void Test(int op1, int op2, int xyz, int def) {
        op1 = ((op1 & op2) == 0) ? 5 : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/fc3eddPx3
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            tst     w0, w1
            beq     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
            mov     w3, #5
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            mov     w0, w3
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

@TamarChristinaArm

Some related issues:

Presumably, some parts of the analysis can be implemented in a platform-agnostic way and benefit both Arm64 and X86 platforms.

category:cq
theme:intrinsics
skill-level:expert
cost:large
impact:medium

@echesakov echesakov added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jul 8, 2021
@echesakov echesakov added this to the Future milestone Jul 8, 2021
@dotnet-issue-labeler dotnet-issue-labeler bot added the untriaged New issue has not been triaged by the area owner label Jul 8, 2021
@echesakov echesakov removed the untriaged New issue has not been triaged by the area owner label Jul 8, 2021
@JulieLeeMSFT JulieLeeMSFT added the User Story A single user-facing feature. Can be grouped under an epic. label Jul 8, 2021
@terrajobst terrajobst added the Bottom Up Work Not part of a theme, epic, or user story label Jul 23, 2021
@echesakov echesakov modified the milestones: Future, 7.0.0 Feb 5, 2022
@a74nh
Copy link
Contributor

a74nh commented Mar 16, 2022

I've been taking a look at this.

Starting with:

        static void TransformsIntoCondBr(uint op1, uint op2) {
            if ((op1 & op2) == 0) {
                op1 = op2;
            } else {
                op2 = op1;
            }
            Console.WriteLine("{0}, {1}", op1, op2);
        }

Gives the CIL:

STMT00000 ( 0x000[E-] ... ??? )
               [000005] ------------              *  JTRUE     void  
               [000004] ------------              \--*  NE        int   
               [000002] ------------                 +--*  AND       int   
               [000000] ------------                 |  +--*  LCL_VAR   int    V00 arg0         
               [000001] ------------                 |  \--*  LCL_VAR   int    V01 arg1         
               [000003] ------------                 \--*  CNS_INT   int    0

Which becomes:

                                                  /--*  t2     int    
                                                  +--*  t3     int    
N005 (  9,  8) [000004] J------N----         t4 = *  NE        int   
                                                  /--*  t4     int    
N006 ( 11, 10) [000005] ------------              *  JTRUE     void  

When running tier1
It uses the code in Lowering::OptimizeConstCompare() to change NE to TEST_NE, and generates optimal code:

                                                              /--*  t0     int    
                                                              +--*  t1     int    
Generating: N009 (  5,  6) [000004] J------N----              *  TEST_NE   void   REG NA $101
IN0001:                           tst     w19, w20
Generating: N011 (  7,  8) [000005] ------------              *  JTRUE     void   REG NA $VN.Void
IN0002:                           bne     (LARGEJMP)L_M63148_BB03

When running tier0
Does not call into OptimizeConstCompare, due to MinOpts check failing.
Instead, LowerJTrue() will get called, creating a JCMP node, which generates:

                                                              /--*  t0     int    
                                                              +--*  t1     int    
Generating: N008 (  7,  5) [000002] ------------         t2 = *  AND       int    REG x0
IN0003:                           and     w0, w0, w1
Generating: N010 (  1,  2) [000003] -c----------         t3 =    CNS_INT   int    0 REG NA
                                                              /--*  t2     int    
                                                              +--*  t3     int    
Generating: N012 (  9,  8) [000004] CNE-------N----              *  JCMP      void   REG NA
IN0004:                           cbnz    (LARGEJMP)L_M63148_BB03

Delete LowerJTrue(), then run tier0
There is no lowering, and everything emits as we'd expect:

                                                              /--*  t0     int    
                                                              +--*  t1     int    
Generating: N008 (  7,  5) [000002] ------------         t2 = *  AND       int    REG x0
IN0003:                           and     w0, w0, w1
Generating: N010 (  1,  2) [000003] -c----------         t3 =    CNS_INT   int    0 REG NA
                                                              /--*  t2     int    
                                                              +--*  t3     int    
Generating: N012 (  9,  8) [000004] J------N----              *  NE        void   REG NA
IN0004:                           cmp     w0, #0
Generating: N014 ( 11, 10) [000005] ------------              *  JTRUE     void   REG NA
IN0005:                           bne     (LARGEJMP)L_M63148_BB03

At first glance, this looks correct. However, the LowerJTrue feels wrong.
OptimizeConstCompare is skipped on tier0 due to the cost of generating the nodes.
Why isn't LowerJTrue skipped on tier0 too? As far as I can tell, the cost of calling LowerJTrue is similar to OptimizeConstCompare?

Would it make sense to

  1. Enable the single optimisation in OptimizeConstCompare on tier0, and remove LowerJTrue.
    This gives us optimal code in all cases
    or
  2. Remove LowerJTrue.
    This simplifies tier0 at the cost of worse code

There are probably some subtleties I'm missing (I'm not sure if OptimizeConstCompare catches all the cases LowerJTrue does). And I've not run any performance testing on any of the above.

If neither of the above holds, then this issue can be closed?

@kunalspathak kunalspathak changed the title Conditional instructions/branch elimination Arm64: Generate conditional comparison and selection instructions Mar 16, 2022
@kunalspathak
Copy link
Member

When running tier1
It uses the code in Lowering::OptimizeConstCompare() to change NE to TEST_NE, and generates optimal code:

That's correct, we do generate optimized code, so I have updated the PR description to reflect the current problem.

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            tst     w0, w1
            bne     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            mov     w0, w1
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
            mov     w1, w0
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Enable the single optimisation in OptimizeConstCompare on tier0, and remove LowerJTrue

We try to do minimum optimization in tier0 for speedup and simplicity, so we mostly focus on improving the tier1 code.

@a74nh
Copy link
Contributor

a74nh commented Mar 18, 2022

Ok, so trying to break this down into a first step:

if (op1 > 0) {
  op1 = 5;
}

Ends up as two basic blocks:

G_M46575_IG02:   
IN0001: 000008                    cmp     w0, #0
IN0002: 00000C                    beq     G_M46575_IG04
G_M46575_IG03: 
IN0003: 000010                    mov     w0, #5

That needs optimising to:

G_M46575_IG02
cmp     w0, #0
mov     w5, #5
csel    w0, w5, w0, ne
G_M46575_IG03

If done correctly, then that should hopefully get rid of all the branches in all the examples above, which should get a large portion of the performance. We can then look at generating the other conditionals and combining instructions in the other examples.

@a74nh
Copy link
Contributor

a74nh commented Apr 8, 2022

I've been speaking to various people within Arm, and the benefit for switching to using csel (and friends) for AArch64 isn't obvious. This is due to modern branch prediction. Branches are predicted many cycles before the condition is evaluated (and before the branch itself is even fetched), so if the prediction is accurate, it results in significant speedups when using branches. In addition, dependency chains on the csel, especially when the result of the csel is required in the next iteration, can significantly slow down csel compared to using branches.

Note that GCC and LLVM make cost based choices on when to use csel. LLVM is considering changing their approach too ( https://discourse.llvm.org/t/rfc-cmov-vs-branch-optimization/6040 ). Of course for a jit, we need the cost of generating the costs to be lightweight.

The current advice is:

  • for cases outside of a loop, use csel.
  • for cases inside a loop, use branches
    Once the above is implemented, then consider adding a cost based approach for using csel inside loops.

The performance impact of the above is likely to be small.

However, every use of csel will reduce code size. AIUI, this is a concern for .Net, so where performance between the two options is the same, then csel should be preferred.

AIUI, X86 has a similar behaviour, but I'm not sure how close.

@TamarChristinaArm for reference

@JulieLeeMSFT
Copy link
Member

#67894

@ghost ghost added the in-pr There is an active PR which will close this issue when it is merged label Jun 6, 2022
@ghost ghost removed the in-pr There is an active PR which will close this issue when it is merged label Aug 1, 2022
@JulieLeeMSFT JulieLeeMSFT modified the milestones: 7.0.0, 8.0.0 Aug 11, 2022
@JulieLeeMSFT
Copy link
Member

Moved to .NET 8 to finish the remaining work.

@JulieLeeMSFT JulieLeeMSFT removed the User Story A single user-facing feature. Can be grouped under an epic. label Oct 14, 2022
@a74nh
Copy link
Contributor

a74nh commented Dec 14, 2022

With the merging of #77728, some of the examples are now looking much better. There are still missing bits.

Example# 1:

    static void Test(uint op1, uint op2) {
        if (op1 > 0 && op2 > 0) {
            op1 = 5;
        }  else {
            op1 = 10;
        }
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/5ov9TKx6P

The op2 check should generate to a CMP and CSEL.
The op1 check will still be a CMP and branch.

This should be fully fixed by #79283

It won't catch when the else case is a different target to the if case, eg:

if (op1 > 0 && op2 > 0) { op1 = 5; } else { op2 = 10; }

Example# 2:

    static void Test(uint op1, uint op2) {
        op1 = op1 > 0 ? 5 : 6;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GTnc4jjfG Current code:

This now will do:

MOV x1, 5
MOV x2, 6
CMP x1, 0
CSEL x1, x1, x2, gt

To make this ideal, we'd have to detect the 6 is 1 greater than the 5:

MOV x1, 5
CMP x1, 0
CINC x1, x1, gt

Should be fairly straightforward to do via lowering/containing.

Example# 3:

    static void Test(uint op1, uint op2) {
        op1 = (op1 > 0) ? 0 : 1;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GoqcsM1Tf

We'll generate:

mov     w2, #1
cmp     w0, #0
csel    w0, wzr, w2, ne

Switching the CSEL to a CSET would get rid of the MOV. I suspect this would need changes in the If Conversion pass.

Example# 4:

    static void Test(uint op1, uint op2, uint xyz, uint def) {
        op1 = op1 > 0 ? xyz : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/1EfxPn48q

This is ideal now. CMP followed by CSEL.

Example# 5:

    static void Test(int op1, int op2, int xyz, int def) {
        op1 = ((op1 & op2) == 0) ? 5 : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/fc3eddPx3

Looking good here.

mov     w2, #5
tst     w0, w1
csel    w0, w2, w3, eq

@a74nh
Copy link
Contributor

a74nh commented Mar 23, 2023

With #79283 now merged, example 1 will produce ideal code.

#82031 is in progress to fix example 2.

@JulieLeeMSFT
Copy link
Member

@a74nh are we going to address example 3-5 as well in the upcoming months?

@a74nh
Copy link
Contributor

a74nh commented Mar 28, 2023

@a74nh are we going to address example 3-5 as well in the upcoming months?

Example 3 - Needs work
Example 4 - Fixed by #77728
Example 5 - Fixed by #77728

Having a quick look at example 3, it only saves a single mov, but it should be fairly easy to implement as it can fit into the existing csel work. I'll get @SwapnilGaikwad to look at this in Q2 so that we can close this issue.

There is also the option of using CINV for the equivalent of:

void Test(int& op1, int& op2) {
    op1 = op1 > op2 ? ~op1 : op1;
}

I suspect instances of this are low. Currently it generates mvn, cmp, csel which can be optimised to cmp, cinv, so again just a single instruction. I'll add this as a low priority item on our list.

@kunalspathak
Copy link
Member

I verified all the examples and they generate expected code. Thank you @a74nh , @SwapnilGaikwad and @jakobbotsch !

C# examples
[MethodImpl(MethodImplOptions.NoInlining)]
        static int Example1(int op1, int op2) {
            if (op1 > 0 && op2 > 0) {
                op1 = 5;
            }
            else {
                op1 = 10;
            }
            return op1;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int Example2(int op1, int op2) {
            return op1 > 0 ? 5 : 6;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int Example3(int op1, int op2) {
            return (op1 > 5) ? 0 : 1;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int Example4(int op1, int op2, int xyz, int def) {
            return op1 > 0 ? xyz : def;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int Example5(int op1, int op2, int xyz, int def) {
            return ((op1 & op2) == 0) ? 5 : def;
        }
Assembly code
Inside TLS()
; Assembly listing for method helloworld.TLS:Example1(int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  5,  5   )     int  ->   x0        
;  V01 arg1         [V01,T01] (  3,  3   )     int  ->   x1         single-def
;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M60152_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M60152_IG02:  ;; offset=0008H
            mov     w2, #10
            mov     w3, #5
            cmp     w0, #0
            ccmp    w1, #0, nzc, gt
            csel    w0, w2, w3, le
						;; size=20 bbWeight=1 PerfScore 2.50
G_M60152_IG03:  ;; offset=001CH
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 36, prolog size 8, PerfScore 9.60, instruction count 9, allocated bytes for code 36 (MethodHash=5f6e1507) for method helloworld.TLS:Example1(int,int):int
; ============================================================

; Assembly listing for method helloworld.TLS:Example2(int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )     int  ->   x0         single-def
;* V01 arg1         [V01    ] (  0,  0   )     int  ->  zero-ref    single-def
;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M31387_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M31387_IG02:  ;; offset=0008H
            mov     w1, #5
            cmp     w0, #0
            cinc    w0, w1, le
						;; size=12 bbWeight=1 PerfScore 1.50
G_M31387_IG03:  ;; offset=0014H
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 28, prolog size 8, PerfScore 7.80, instruction count 7, allocated bytes for code 28 (MethodHash=8c118564) for method helloworld.TLS:Example2(int,int):int
; ============================================================

; Assembly listing for method helloworld.TLS:Example3(int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )     int  ->   x0         single-def
;* V01 arg1         [V01    ] (  0,  0   )     int  ->  zero-ref    single-def
;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M43834_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M43834_IG02:  ;; offset=0008H
            cmp     w0, #5
            cset    x0, le
						;; size=8 bbWeight=1 PerfScore 1.00
G_M43834_IG03:  ;; offset=0010H
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 24, prolog size 8, PerfScore 6.90, instruction count 6, allocated bytes for code 24 (MethodHash=993b54c5) for method helloworld.TLS:Example3(int,int):int
; ============================================================

; Assembly listing for method helloworld.TLS:Example4(int,int,int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )     int  ->   x0         single-def
;* V01 arg1         [V01    ] (  0,  0   )     int  ->  zero-ref    single-def
;  V02 arg2         [V02,T01] (  3,  3   )     int  ->   x2         single-def
;  V03 arg3         [V03,T02] (  3,  3   )     int  ->   x3         single-def
;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M62429_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M62429_IG02:  ;; offset=0008H
            cmp     w0, #0
            csel    w0, w2, w3, gt
						;; size=8 bbWeight=1 PerfScore 1.00
G_M62429_IG03:  ;; offset=0010H
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 24, prolog size 8, PerfScore 6.90, instruction count 6, allocated bytes for code 24 (MethodHash=79970c22) for method helloworld.TLS:Example4(int,int,int,int):int
; ============================================================

; Assembly listing for method helloworld.TLS:Example5(int,int,int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )     int  ->   x0         single-def
;  V01 arg1         [V01,T01] (  3,  3   )     int  ->   x1         single-def
;* V02 arg2         [V02    ] (  0,  0   )     int  ->  zero-ref    single-def
;  V03 arg3         [V03,T02] (  3,  3   )     int  ->   x3         single-def
;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M9340_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M9340_IG02:  ;; offset=0008H
            mov     w2, #5
            tst     w0, w1
            csel    w0, w2, w3, eq
						;; size=12 bbWeight=1 PerfScore 1.50
G_M9340_IG03:  ;; offset=0014H
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 28, prolog size 8, PerfScore 7.80, instruction count 7, allocated bytes for code 28 (MethodHash=1640db83) for method helloworld.TLS:Example5(int,int,int,int):int
; ============================================================

@ghost ghost locked as resolved and limited conversation to collaborators Jun 25, 2023
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI Bottom Up Work Not part of a theme, epic, or user story
Projects
Archived in project
Development

Successfully merging a pull request may close this issue.

6 participants
@a74nh @terrajobst @echesakov @kunalspathak @JulieLeeMSFT and others