Arm64: Generate conditional comparison and selection instructions #55364

echesakov · 2021-07-08T20:15:49Z

Arm64 provides branchless conditional selection and comparison instructions that should be utilized by RyuJIT in the code it generates.

Reference: https://eclecticlight.co/2021/07/20/code-in-arm-assembly-conditions-without-branches/

RyuJIT already has support for them as seen below:

runtime/src/coreclr/jit/instrsarm64.h

Lines 1353 to 1375 in f0b7773

    
           INST1(csel,        "csel",         0,      IF_DR_3D,  0x1A800000) 
        
                                              //  csel    Rd,Rn,Rm,cond        DR_3D  X0011010100mmmmm cccc00nnnnnddddd   1A80 0000   cond 
        
           INST1(csinc,       "csinc",        0,      IF_DR_3D,  0x1A800400) 
        
                                              //  csinc   Rd,Rn,Rm,cond        DR_3D  X0011010100mmmmm cccc01nnnnnddddd   1A80 0400   cond 
        
           INST1(csinv,       "csinv",        0,      IF_DR_3D,  0x5A800000) 
        
                                              //  csinv   Rd,Rn,Rm,cond        DR_3D  X1011010100mmmmm cccc00nnnnnddddd   5A80 0000   cond 
        
           INST1(csneg,       "csneg",        0,      IF_DR_3D,  0x5A800400) 
        
                                              //  csneg   Rd,Rn,Rm,cond        DR_3D  X1011010100mmmmm cccc01nnnnnddddd   5A80 0400   cond 
        
           INST1(cinc,        "cinc",         0,      IF_DR_2D,  0x1A800400) 
        
                                              //  cinc    Rd,Rn,cond           DR_2D  X0011010100nnnnn cccc01nnnnnddddd   1A80 0400   cond 
        
           INST1(cinv,        "cinv",         0,      IF_DR_2D,  0x5A800000) 
        
                                              //  cinv    Rd,Rn,cond           DR_2D  X1011010100nnnnn cccc00nnnnnddddd   5A80 0000   cond 
        
           INST1(cneg,        "cneg",         0,      IF_DR_2D,  0x5A800400) 
        
                                              //  cneg    Rd,Rn,cond           DR_2D  X1011010100nnnnn cccc01nnnnnddddd   5A80 0400   cond 
        
           INST1(cset,        "cset",         0,      IF_DR_1D,  0x1A9F07E0) 
        
                                              //  cset    Rd,cond              DR_1D  X001101010011111 cccc0111111ddddd   1A9F 07E0   Rd cond

runtime/src/coreclr/jit/instrsarm64.h

Lines 633 to 639 in f0b7773

    
           INST2(ccmp,        "ccmp",         CMP,    IF_EN2F,   0x7A400000,  0x7A400800) 
        
                                              //  ccmp    Rn,Rm,  nzcv,cond    DR_2I  X1111010010mmmmm cccc00nnnnn0nzcv   7A40 0000         nzcv, cond 
        
                                              //  ccmp    Rn,imm5,nzcv,cond    DI_1F  X1111010010iiiii cccc10nnnnn0nzcv   7A40 0800   imm5, nzcv, cond 
        
           INST2(ccmn,        "ccmn",         CMP,    IF_EN2F,   0x3A400000,  0x3A400800) 
        
                                              //  ccmn    Rn,Rm,  nzcv,cond    DR_2I  X0111010010mmmmm cccc00nnnnn0nzcv   3A40 0000         nzcv, cond 
        
                                              //  ccmn    Rn,imm5,nzcv,cond    DI_1F  X0111010010iiiii cccc10nnnnn0nzcv   3A40 0800   imm5, nzcv, cond

Currently, the methods emitIns_R_R_R_COND and emitIns_R_I_FLAGS_COND that produce these instructions are not utilized at all. emitIns_R_R_R_COND was recently used in #66407 to generate the csneg instruction. Once these instructions are used, we could produce much better code. Below are some examples:

Example# 1:

    static void Test(uint op1, uint op2) {
        if (op1 > 0 && op2 > 0) {
            op1 = 5;
        }  else {
            op1 = 10;
        }
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/5ov9TKx6P
Current code:

G_M2878_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M2878_IG02:
            cbz     w0, G_M2878_IG04
                                                ;; bbWeight=1    PerfScore 1.00
G_M2878_IG03:
            cbz     w1, G_M2878_IG04
            mov     w0, #5
            b       G_M2878_IG05
                                                ;; bbWeight=0.50 PerfScore 1.25
G_M2878_IG04:
            mov     w0, #10
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M2878_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M2878_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 2:

    static void Test(uint op1, uint op2) {
        op1 = op1 > 0 ? 5 : 6;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GTnc4jjfG
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cmp     w0, #0
            bgt     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            mov     w0, #6
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
            mov     w0, #5
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 3:

    static void Test(uint op1, uint op2) {
        op1 = (op1 > 0) ? 0 : 1;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GoqcsM1Tf
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cmp     w0, #0
            bgt     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            mov     w0, #1
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
            mov     w0, wzr
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 4:

    static void Test(uint op1, uint op2, uint xyz, uint def) {
        op1 = op1 > 0 ? xyz : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/1EfxPn48q
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            cbnz    w0, G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG03:
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
            mov     w3, w2
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            mov     w0, w3
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Example# 5:

    static void Test(int op1, int op2, int xyz, int def) {
        op1 = ((op1 & op2) == 0) ? 5 : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/fc3eddPx3
Current code:

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            tst     w0, w1
            beq     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.50
G_M9565_IG04:
            mov     w3, #5
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            mov     w0, w3
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

@TamarChristinaArm

Some related issues:

RyuJit: avoid conditional jumps using cmov and similar instructions #6749 RyuJit: avoid conditional jumps using cmov and similar instructions
RyuJIT: Optimize "X / POW2_CNS" via cmovns #41549 RyuJIT: Optimize "X / POW2_CNS" via cmovns
[RyuJIT][arm64] Optimize "x<0" and "x>=0" #43440 [RyuJIT][arm64] Optimize "x<0" and "x>=0"

Presumably, some parts of the analysis can be implemented in a platform-agnostic way and benefit both Arm64 and X86 platforms.

category:cq
theme:intrinsics
skill-level:expert
cost:large
impact:medium

The text was updated successfully, but these errors were encountered:

a74nh · 2022-03-16T12:33:47Z

I've been taking a look at this.

Starting with:

        static void TransformsIntoCondBr(uint op1, uint op2) {
            if ((op1 & op2) == 0) {
                op1 = op2;
            } else {
                op2 = op1;
            }
            Console.WriteLine("{0}, {1}", op1, op2);
        }

Gives the CIL:

STMT00000 ( 0x000[E-] ... ??? )
               [000005] ------------              *  JTRUE     void  
               [000004] ------------              \--*  NE        int   
               [000002] ------------                 +--*  AND       int   
               [000000] ------------                 |  +--*  LCL_VAR   int    V00 arg0         
               [000001] ------------                 |  \--*  LCL_VAR   int    V01 arg1         
               [000003] ------------                 \--*  CNS_INT   int    0

Which becomes:

                                                  /--*  t2     int    
                                                  +--*  t3     int    
N005 (  9,  8) [000004] J------N----         t4 = *  NE        int   
                                                  /--*  t4     int    
N006 ( 11, 10) [000005] ------------              *  JTRUE     void

When running tier1
It uses the code in Lowering::OptimizeConstCompare() to change NE to TEST_NE, and generates optimal code:

                                                              /--*  t0     int    
                                                              +--*  t1     int    
Generating: N009 (  5,  6) [000004] J------N----              *  TEST_NE   void   REG NA $101
IN0001:                           tst     w19, w20
Generating: N011 (  7,  8) [000005] ------------              *  JTRUE     void   REG NA $VN.Void
IN0002:                           bne     (LARGEJMP)L_M63148_BB03

When running tier0
Does not call into OptimizeConstCompare, due to MinOpts check failing.
Instead, LowerJTrue() will get called, creating a JCMP node, which generates:

                                                              /--*  t0     int    
                                                              +--*  t1     int    
Generating: N008 (  7,  5) [000002] ------------         t2 = *  AND       int    REG x0
IN0003:                           and     w0, w0, w1
Generating: N010 (  1,  2) [000003] -c----------         t3 =    CNS_INT   int    0 REG NA
                                                              /--*  t2     int    
                                                              +--*  t3     int    
Generating: N012 (  9,  8) [000004] CNE-------N----              *  JCMP      void   REG NA
IN0004:                           cbnz    (LARGEJMP)L_M63148_BB03

Delete LowerJTrue(), then run tier0
There is no lowering, and everything emits as we'd expect:

                                                              /--*  t0     int    
                                                              +--*  t1     int    
Generating: N008 (  7,  5) [000002] ------------         t2 = *  AND       int    REG x0
IN0003:                           and     w0, w0, w1
Generating: N010 (  1,  2) [000003] -c----------         t3 =    CNS_INT   int    0 REG NA
                                                              /--*  t2     int    
                                                              +--*  t3     int    
Generating: N012 (  9,  8) [000004] J------N----              *  NE        void   REG NA
IN0004:                           cmp     w0, #0
Generating: N014 ( 11, 10) [000005] ------------              *  JTRUE     void   REG NA
IN0005:                           bne     (LARGEJMP)L_M63148_BB03

At first glance, this looks correct. However, the LowerJTrue feels wrong.
OptimizeConstCompare is skipped on tier0 due to the cost of generating the nodes.
Why isn't LowerJTrue skipped on tier0 too? As far as I can tell, the cost of calling LowerJTrue is similar to OptimizeConstCompare?

Would it make sense to

Enable the single optimisation in OptimizeConstCompare on tier0, and remove LowerJTrue.
This gives us optimal code in all cases
or
Remove LowerJTrue.
This simplifies tier0 at the cost of worse code

There are probably some subtleties I'm missing (I'm not sure if OptimizeConstCompare catches all the cases LowerJTrue does). And I've not run any performance testing on any of the above.

If neither of the above holds, then this issue can be closed?

kunalspathak · 2022-03-16T15:31:46Z

When running tier1
It uses the code in Lowering::OptimizeConstCompare() to change NE to TEST_NE, and generates optimal code:

That's correct, we do generate optimized code, so I have updated the PR description to reflect the current problem.

G_M9565_IG01:
            stp     fp, lr, [sp,#-16]!
            mov     fp, sp
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG02:
            tst     w0, w1
            bne     G_M9565_IG04
                                                ;; bbWeight=1    PerfScore 1.50
G_M9565_IG03:
            mov     w0, w1
            b       G_M9565_IG05
                                                ;; bbWeight=0.50 PerfScore 0.75
G_M9565_IG04:
            mov     w1, w0
                                                ;; bbWeight=0.50 PerfScore 0.25
G_M9565_IG05:
            bl      _12219:Consume(int,int)
                                                ;; bbWeight=1    PerfScore 1.00
G_M9565_IG06:
            ldp     fp, lr, [sp],#16
            ret     lr
                                                ;; bbWeight=1    PerfScore 2.00

Enable the single optimisation in OptimizeConstCompare on tier0, and remove LowerJTrue

We try to do minimum optimization in tier0 for speedup and simplicity, so we mostly focus on improving the tier1 code.

a74nh · 2022-03-18T12:41:13Z

Ok, so trying to break this down into a first step:

if (op1 > 0) {
  op1 = 5;
}

Ends up as two basic blocks:

G_M46575_IG02:   
IN0001: 000008                    cmp     w0, #0
IN0002: 00000C                    beq     G_M46575_IG04
G_M46575_IG03: 
IN0003: 000010                    mov     w0, #5

That needs optimising to:

G_M46575_IG02
cmp     w0, #0
mov     w5, #5
csel    w0, w5, w0, ne
G_M46575_IG03

If done correctly, then that should hopefully get rid of all the branches in all the examples above, which should get a large portion of the performance. We can then look at generating the other conditionals and combining instructions in the other examples.

a74nh · 2022-04-08T14:03:30Z

I've been speaking to various people within Arm, and the benefit for switching to using csel (and friends) for AArch64 isn't obvious. This is due to modern branch prediction. Branches are predicted many cycles before the condition is evaluated (and before the branch itself is even fetched), so if the prediction is accurate, it results in significant speedups when using branches. In addition, dependency chains on the csel, especially when the result of the csel is required in the next iteration, can significantly slow down csel compared to using branches.

Note that GCC and LLVM make cost-based choices on when to use csel. LLVM is considering changing their approach too ( https://discourse.llvm.org/t/rfc-cmov-vs-branch-optimization/6040 ). Of course, for a JIT, the cost model itself needs to be lightweight to compute.

The current advice is:

for cases outside of a loop, use csel.
for cases inside a loop, use branches
Once the above is implemented, then consider adding a cost based approach for using csel inside loops.

The performance impact of the above is likely to be small.

However, every use of csel will reduce code size. AIUI, this is a concern for .NET, so where performance between the two options is the same, csel should be preferred.

AIUI, X86 has a similar behaviour, but I'm not sure how close.

@TamarChristinaArm for reference

JulieLeeMSFT · 2022-06-06T16:54:16Z

#67894

JulieLeeMSFT · 2022-08-11T17:59:29Z

Moved to .NET 8 to finish the remaining work.

a74nh · 2022-12-14T12:01:21Z

With the merging of #77728, some of the examples are now looking much better. There are still missing bits.

Example# 1:

    static void Test(uint op1, uint op2) {
        if (op1 > 0 && op2 > 0) {
            op1 = 5;
        }  else {
            op1 = 10;
        }
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/5ov9TKx6P

The op2 check should generate to a CMP and CSEL.
The op1 check will still be a CMP and branch.

This should be fully fixed by #79283

It won't catch when the else case is a different target to the if case, eg:

if (op1 > 0 && op2 > 0) { op1 = 5; } else { op2 = 10; }

Example# 2:

    static void Test(uint op1, uint op2) {
        op1 = op1 > 0 ? 5 : 6;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GTnc4jjfG Current code:

This now will do:

MOV x1, 5
MOV x2, 6
CMP x0, 0
CSEL x0, x1, x2, gt

To make this ideal, we'd have to detect the 6 is 1 greater than the 5:

MOV x1, 5
CMP x0, 0
CINC x0, x1, le

Should be fairly straightforward to do via lowering/containing.

Example# 3:

    static void Test(uint op1, uint op2) {
        op1 = (op1 > 0) ? 0 : 1;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/GoqcsM1Tf

We'll generate:

mov     w2, #1
cmp     w0, #0
csel    w0, wzr, w2, ne

Switching the CSEL to a CSET would get rid of the MOV. I suspect this would need changes in the If Conversion pass.

Example# 4:

    static void Test(uint op1, uint op2, uint xyz, uint def) {
        op1 = op1 > 0 ? xyz : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/1EfxPn48q

This is ideal now. CMP followed by CSEL.

Example# 5:

    static void Test(int op1, int op2, int xyz, int def) {
        op1 = ((op1 & op2) == 0) ? 5 : def;
        Consume(op1, op2);
    }

Ideal code: https://godbolt.org/z/fc3eddPx3

Looking good here.

mov     w2, #5
tst     w0, w1
csel    w0, w2, w3, eq

a74nh · 2023-03-23T14:09:48Z

With #79283 now merged, example 1 will produce ideal code.

#82031 is in progress to fix example 2.

JulieLeeMSFT · 2023-03-27T20:34:45Z

@a74nh are we going to address example 3-5 as well in the upcoming months?

a74nh · 2023-03-28T10:22:01Z

@a74nh are we going to address example 3-5 as well in the upcoming months?

Example 3 - Needs work
Example 4 - Fixed by #77728
Example 5 - Fixed by #77728

Having a quick look at example 3, it only saves a single mov, but it should be fairly easy to implement as it can fit into the existing csel work. I'll get @SwapnilGaikwad to look at this in Q2 so that we can close this issue.

There is also the option of using CINV for the equivalent of:

void Test(int& op1, int& op2) {
    op1 = op1 > op2 ? ~op1 : op1;
}

I suspect instances of this are low. Currently it generates mvn, cmp, csel which can be optimised to cmp, cinv, so again just a single instruction. I'll add this as a low priority item on our list.

kunalspathak · 2023-05-26T00:37:09Z

I verified all the examples and they generate expected code. Thank you @a74nh , @SwapnilGaikwad and @jakobbotsch !

C# examples

[MethodImpl(MethodImplOptions.NoInlining)]
        static int Example1(int op1, int op2) {
            if (op1 > 0 && op2 > 0) {
                op1 = 5;
            }
            else {
                op1 = 10;
            }
            return op1;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int Example2(int op1, int op2) {
            return op1 > 0 ? 5 : 6;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int Example3(int op1, int op2) {
            return (op1 > 5) ? 0 : 1;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int Example4(int op1, int op2, int xyz, int def) {
            return op1 > 0 ? xyz : def;
        }

        [MethodImpl(MethodImplOptions.NoInlining)]
        static int Example5(int op1, int op2, int xyz, int def) {
            return ((op1 & op2) == 0) ? 5 : def;
        }

Assembly code

Inside TLS()
; Assembly listing for method helloworld.TLS:Example1(int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  5,  5   )     int  ->   x0        
;  V01 arg1         [V01,T01] (  3,  3   )     int  ->   x1         single-def
;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M60152_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M60152_IG02:  ;; offset=0008H
            mov     w2, #10
            mov     w3, #5
            cmp     w0, #0
            ccmp    w1, #0, nzc, gt
            csel    w0, w2, w3, le
						;; size=20 bbWeight=1 PerfScore 2.50
G_M60152_IG03:  ;; offset=001CH
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 36, prolog size 8, PerfScore 9.60, instruction count 9, allocated bytes for code 36 (MethodHash=5f6e1507) for method helloworld.TLS:Example1(int,int):int
; ============================================================

; Assembly listing for method helloworld.TLS:Example2(int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )     int  ->   x0         single-def
;* V01 arg1         [V01    ] (  0,  0   )     int  ->  zero-ref    single-def
;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M31387_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M31387_IG02:  ;; offset=0008H
            mov     w1, #5
            cmp     w0, #0
            cinc    w0, w1, le
						;; size=12 bbWeight=1 PerfScore 1.50
G_M31387_IG03:  ;; offset=0014H
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 28, prolog size 8, PerfScore 7.80, instruction count 7, allocated bytes for code 28 (MethodHash=8c118564) for method helloworld.TLS:Example2(int,int):int
; ============================================================

; Assembly listing for method helloworld.TLS:Example3(int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )     int  ->   x0         single-def
;* V01 arg1         [V01    ] (  0,  0   )     int  ->  zero-ref    single-def
;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M43834_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M43834_IG02:  ;; offset=0008H
            cmp     w0, #5
            cset    x0, le
						;; size=8 bbWeight=1 PerfScore 1.00
G_M43834_IG03:  ;; offset=0010H
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 24, prolog size 8, PerfScore 6.90, instruction count 6, allocated bytes for code 24 (MethodHash=993b54c5) for method helloworld.TLS:Example3(int,int):int
; ============================================================

; Assembly listing for method helloworld.TLS:Example4(int,int,int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )     int  ->   x0         single-def
;* V01 arg1         [V01    ] (  0,  0   )     int  ->  zero-ref    single-def
;  V02 arg2         [V02,T01] (  3,  3   )     int  ->   x2         single-def
;  V03 arg3         [V03,T02] (  3,  3   )     int  ->   x3         single-def
;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M62429_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M62429_IG02:  ;; offset=0008H
            cmp     w0, #0
            csel    w0, w2, w3, gt
						;; size=8 bbWeight=1 PerfScore 1.00
G_M62429_IG03:  ;; offset=0010H
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 24, prolog size 8, PerfScore 6.90, instruction count 6, allocated bytes for code 24 (MethodHash=79970c22) for method helloworld.TLS:Example4(int,int,int,int):int
; ============================================================

; Assembly listing for method helloworld.TLS:Example5(int,int,int,int):int
; Emitting BLENDED_CODE for generic ARM64 - Windows
; optimized code
; fp based frame
; partially interruptible
; No PGO data
; invoked as altjit
; Final local variable assignments
;
;  V00 arg0         [V00,T00] (  3,  3   )     int  ->   x0         single-def
;  V01 arg1         [V01,T01] (  3,  3   )     int  ->   x1         single-def
;* V02 arg2         [V02    ] (  0,  0   )     int  ->  zero-ref    single-def
;  V03 arg3         [V03,T02] (  3,  3   )     int  ->   x3         single-def
;# V04 OutArgs      [V04    ] (  1,  1   )  struct ( 0) [sp+00H]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
;
; Lcl frame size = 0

G_M9340_IG01:  ;; offset=0000H
            stp     fp, lr, [sp, #-0x10]!
            mov     fp, sp
						;; size=8 bbWeight=1 PerfScore 1.50
G_M9340_IG02:  ;; offset=0008H
            mov     w2, #5
            tst     w0, w1
            csel    w0, w2, w3, eq
						;; size=12 bbWeight=1 PerfScore 1.50
G_M9340_IG03:  ;; offset=0014H
            ldp     fp, lr, [sp], #0x10
            ret     lr
						;; size=8 bbWeight=1 PerfScore 2.00

; Total bytes of code 28, prolog size 8, PerfScore 7.80, instruction count 7, allocated bytes for code 28 (MethodHash=1640db83) for method helloworld.TLS:Example5(int,int,int,int):int
; ============================================================

echesakov added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jul 8, 2021

echesakov added this to the Future milestone Jul 8, 2021

dotnet-issue-labeler bot added the untriaged New issue has not been triaged by the area owner label Jul 8, 2021

echesakov removed the untriaged New issue has not been triaged by the area owner label Jul 8, 2021

echesakov mentioned this issue Jul 8, 2021

[Arm64] Planned JIT work in .NET 6 #43629

Closed

29 tasks

JulieLeeMSFT added the User Story A single user-facing feature. Can be grouped under an epic. label Jul 8, 2021

JulieLeeMSFT assigned echesakov Jul 8, 2021

terrajobst added the Bottom Up Work Not part of a theme, epic, or user story label Jul 23, 2021

EgorBo mentioned this issue Nov 11, 2021

Performance Improvement - Ternary operator with constants #61480

Closed

echesakov modified the milestones: Future, 7.0.0 Feb 5, 2022

JulieLeeMSFT assigned kunalspathak and unassigned echesakov Mar 10, 2022

kunalspathak changed the title ~~Conditional instructions/branch elimination~~ Arm64: Generate conditional comparison and selection instructions Mar 16, 2022

JulieLeeMSFT assigned a74nh Mar 21, 2022

kunalspathak mentioned this issue Mar 28, 2022

Improving ARM64 Performance in .NET 7.0 #64820

Closed

32 tasks

kunalspathak mentioned this issue Jun 6, 2022

Arm64: Use csel and ccmp for conditional moves #67894

Closed

ghost added the in-pr There is an active PR which will close this issue when it is merged label Jun 6, 2022

JulieLeeMSFT mentioned this issue Jul 28, 2022

What's new in .NET 7 Preview 7 [WIP] dotnet/core#7455

Closed

ghost removed the in-pr There is an active PR which will close this issue when it is merged label Aug 1, 2022

JulieLeeMSFT modified the milestones: 7.0.0, 8.0.0 Aug 11, 2022

kunalspathak mentioned this issue Oct 13, 2022

Improving Arm64 Performance in .NET 8.0 #77010

Closed

28 tasks

JulieLeeMSFT removed the User Story A single user-facing feature. Can be grouped under an epic. label Oct 14, 2022

This was referenced Feb 14, 2023

Arm64: Combine if conditions into compare chains #79283

Merged

Use cinc instead of csel when possible #82031

Merged

kunalspathak closed this as completed May 26, 2023

ghost locked as resolved and limited conversation to collaborators Jun 25, 2023

JulieLeeMSFT added this to .NET Core CodeGen Jun 5, 2024

JulieLeeMSFT moved this to Done in .NET Core CodeGen Jun 5, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Arm64: Generate conditional comparison and selection instructions #55364

Arm64: Generate conditional comparison and selection instructions #55364

echesakov commented Jul 8, 2021 •

edited by BruceForstall

Loading

a74nh commented Mar 16, 2022

kunalspathak commented Mar 16, 2022

a74nh commented Mar 18, 2022

a74nh commented Apr 8, 2022

JulieLeeMSFT commented Jun 6, 2022

JulieLeeMSFT commented Aug 11, 2022

a74nh commented Dec 14, 2022 •

edited

Loading

a74nh commented Mar 23, 2023

JulieLeeMSFT commented Mar 27, 2023

a74nh commented Mar 28, 2023

kunalspathak commented May 26, 2023

Arm64: Generate conditional comparison and selection instructions #55364

Arm64: Generate conditional comparison and selection instructions #55364

Comments

echesakov commented Jul 8, 2021 • edited by BruceForstall Loading

a74nh commented Mar 16, 2022

kunalspathak commented Mar 16, 2022

a74nh commented Mar 18, 2022

a74nh commented Apr 8, 2022

JulieLeeMSFT commented Jun 6, 2022

JulieLeeMSFT commented Aug 11, 2022

a74nh commented Dec 14, 2022 • edited Loading

a74nh commented Mar 23, 2023

JulieLeeMSFT commented Mar 27, 2023

a74nh commented Mar 28, 2023

kunalspathak commented May 26, 2023

echesakov commented Jul 8, 2021 •

edited by BruceForstall

Loading

a74nh commented Dec 14, 2022 •

edited

Loading