Ineffective codegen for or pattern type checking #47920

huoyaoyuan · 2021-02-05T18:00:09Z

Description

Examine this code:

using System;
public class C
{
    public bool M0(object o)
    {
        return o is int or uint or long;
    }
    public bool M1(object o)
    {
        return o != null && (o is int or uint or long);
    }
    public bool M2(object o)
    {
        return o != null && (o.GetType() == typeof(int) || o.GetType() == typeof(uint) || o.GetType() == typeof(long));
    }
}

IL Compiled:

.method public hidebysig 
        instance bool M0 (
            object o
        ) cil managed 
    {
        // Method begins at RVA 0x2050
        // Code size 28 (0x1c)
        .maxstack 8

        IL_0000: ldarg.1
        IL_0001: isinst [System.Private.CoreLib]System.Int32
        IL_0006: brtrue.s IL_001a

        IL_0008: ldarg.1
        IL_0009: isinst [System.Private.CoreLib]System.UInt32
        IL_000e: brtrue.s IL_001a

        IL_0010: ldarg.1
        IL_0011: isinst [System.Private.CoreLib]System.Int64
        IL_0016: ldnull
        IL_0017: cgt.un
        IL_0019: ret

        IL_001a: ldc.i4.1
        IL_001b: ret
    } // end of method C::M0

    .method public hidebysig 
        instance bool M1 (
            object o
        ) cil managed 
    {
        // Method begins at RVA 0x206d
        // Code size 33 (0x21)
        .maxstack 8

        IL_0000: ldarg.1
        IL_0001: brfalse.s IL_001f

        IL_0003: ldarg.1
        IL_0004: isinst [System.Private.CoreLib]System.Int32
        IL_0009: brtrue.s IL_001d

        IL_000b: ldarg.1
        IL_000c: isinst [System.Private.CoreLib]System.UInt32
        IL_0011: brtrue.s IL_001d

        IL_0013: ldarg.1
        IL_0014: isinst [System.Private.CoreLib]System.Int64
        IL_0019: ldnull
        IL_001a: cgt.un
        IL_001c: ret

        IL_001d: ldc.i4.1
        IL_001e: ret

        IL_001f: ldc.i4.0
        IL_0020: ret
    } // end of method C::M1

    .method public hidebysig 
        instance bool M2 (
            object o
        ) cil managed 
    {
        // Method begins at RVA 0x2090
        // Code size 75 (0x4b)
        .maxstack 2

        IL_0000: ldarg.1
        IL_0001: brfalse.s IL_0049

        IL_0003: ldarg.1
        IL_0004: callvirt instance class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Object::GetType()
        IL_0009: ldtoken [System.Private.CoreLib]System.Int32
        IL_000e: call class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Type::GetTypeFromHandle(valuetype [System.Private.CoreLib]System.RuntimeTypeHandle)
        IL_0013: call bool [System.Private.CoreLib]System.Type::op_Equality(class [System.Private.CoreLib]System.Type, class [System.Private.CoreLib]System.Type)
        IL_0018: brtrue.s IL_0047

        IL_001a: ldarg.1
        IL_001b: callvirt instance class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Object::GetType()
        IL_0020: ldtoken [System.Private.CoreLib]System.UInt32
        IL_0025: call class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Type::GetTypeFromHandle(valuetype [System.Private.CoreLib]System.RuntimeTypeHandle)
        IL_002a: call bool [System.Private.CoreLib]System.Type::op_Equality(class [System.Private.CoreLib]System.Type, class [System.Private.CoreLib]System.Type)
        IL_002f: brtrue.s IL_0047

        IL_0031: ldarg.1
        IL_0032: callvirt instance class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Object::GetType()
        IL_0037: ldtoken [System.Private.CoreLib]System.Int64
        IL_003c: call class [System.Private.CoreLib]System.Type [System.Private.CoreLib]System.Type::GetTypeFromHandle(valuetype [System.Private.CoreLib]System.RuntimeTypeHandle)
        IL_0041: call bool [System.Private.CoreLib]System.Type::op_Equality(class [System.Private.CoreLib]System.Type, class [System.Private.CoreLib]System.Type)
        IL_0046: ret

        IL_0047: ldc.i4.1
        IL_0048: ret

        IL_0049: ldc.i4.0
        IL_004a: ret
    } // end of method C::M2

codegen:

C.M0(System.Object)
    L0000: mov rax, rdx
    L0003: test rax, rax
    L0006: je short L0017
    L0008: mov rcx, 0x7ff9125bb258
    L0012: cmp [rax], rcx
    L0015: jne short L001c
    L0017: test rdx, rdx
    L001a: jne short L0058
    L001c: mov rax, rdx
    L001f: test rax, rax
    L0022: je short L0033
    L0024: mov rcx, 0x7ff9125bbc18
    L002e: cmp [rax], rcx
    L0031: jne short L0038
    L0033: test rdx, rdx
    L0036: jne short L0058
    L0038: test rdx, rdx
    L003b: je short L004e
    L003d: mov rax, 0x7ff9125bc5d8
    L0047: cmp [rdx], rax
    L004a: je short L004e
    L004c: xor edx, edx
    L004e: test rdx, rdx
    L0051: setne al
    L0054: movzx eax, al
    L0057: ret
    L0058: mov eax, 1
    L005d: ret

C.M1(System.Object)
    L0000: test rdx, rdx
    L0003: je short L004a
    L0005: mov rax, rdx
    L0008: mov rcx, 0x7ff9125bb258
    L0012: cmp [rax], rcx
    L0015: je short L0044
    L0017: mov rax, rdx
    L001a: mov rcx, 0x7ff9125bbc18
    L0024: cmp [rax], rcx
    L0027: je short L0044
    L0029: mov rax, 0x7ff9125bc5d8
    L0033: cmp [rdx], rax
    L0036: je short L003a
    L0038: xor edx, edx
    L003a: test rdx, rdx
    L003d: setne al
    L0040: movzx eax, al
    L0043: ret
    L0044: mov eax, 1
    L0049: ret
    L004a: xor eax, eax
    L004c: ret

C.M2(System.Object)
    L0000: test rdx, rdx
    L0003: je short L003d
    L0005: mov rax, 0x7ff9125bb258
    L000f: cmp [rdx], rax
    L0012: je short L0037
    L0014: mov rax, 0x7ff9125bbc18
    L001e: cmp [rdx], rax
    L0021: je short L0037
    L0023: mov rax, 0x7ff9125bc5d8
    L002d: cmp [rdx], rax
    L0030: sete al
    L0033: movzx eax, al
    L0036: ret
    L0037: mov eax, 1
    L003c: ret
    L003d: xor eax, eax
    L003f: ret

sharplab

Looking through it:
The pattern matching codegen moves from rdx to rax for every test.
If there's no null check, JIT will generate null test for each condition.

The compiled IL of pattern matching uses ldnull and cgt for the last test. This pattern may confuse the JIT.

Configuration

sharplab default x64 (5.0 timeframe)

Regression?

Probably not.

category:cq
theme:redundant-branches
skill-level:expert
cost:medium
impact:small

The text was updated successfully, but these errors were encountered:

AndyAyersMS · 2021-02-05T18:57:40Z

Did you have a chance to look at 6.0 codegen? It may be better.

If not, I'll take a look when I get a chance.

huoyaoyuan · 2021-02-05T19:06:40Z

I changed to master branch in sharplab, and the codegen was identical.
Editing the code may crash sharplab.

AndyAyersMS · 2021-02-05T19:26:37Z

I don't think sharplab's master tests 6.0 codegen -- suspect it refers to the roslyn branch, not the runtime branch.

huoyaoyuan · 2021-02-05T19:56:11Z

Historically, sharplab uses master for both roslyn and runtime master.

I've built latest master and viewed with disasmo. The problem still exists.

By the way, o is int || o is uint || o is long generates identical IL with the worst scenario. Roslyn doesn't introduce a new IL pattern for new language feature here.

jnyrup · 2021-02-06T13:58:01Z

Here are two other snippets than generates new asm, at least with SharpLab.

I don't know if they are "better", but I observe that:

M3 has two instructions less than M2.
M4 has two instructions less than M0, but three more than M1.

public bool M3(object o)
{
    return o is not null and (int or uint or long);
}

public bool M4(object o)
{
    return o switch 
    {
        int or uint or long => true,
        _ => false
    };
}

C.M3(System.Object)
    L0000: push ebp
    L0001: mov ebp, esp
    L0003: test edx, edx
    L0005: je short L002a
    L0007: mov eax, edx
    L0009: cmp dword ptr [eax], 0xca73b4
    L000f: je short L0023
    L0011: mov eax, edx
    L0013: cmp dword ptr [eax], 0xca79b8
    L0019: je short L0023
    L001b: cmp dword ptr [edx], 0xca7fbc
    L0021: jne short L002a
    L0023: mov eax, 1
    L0028: jmp short L002c
    L002a: xor eax, eax
    L002c: pop ebp
    L002d: ret

C.M4(System.Object)
    L0000: push ebp
    L0001: mov ebp, esp
    L0003: mov eax, edx
    L0005: test eax, eax
    L0007: je short L0011
    L0009: cmp dword ptr [eax], 0xca73b4
    L000f: jne short L0015
    L0011: test edx, edx
    L0013: jne short L0037
    L0015: mov eax, edx
    L0017: test eax, eax
    L0019: je short L0023
    L001b: cmp dword ptr [eax], 0xca79b8
    L0021: jne short L0027
    L0023: test edx, edx
    L0025: jne short L0037
    L0027: test edx, edx
    L0029: je short L0033
    L002b: cmp dword ptr [edx], 0xca7fbc
    L0031: jne short L003e
    L0033: test edx, edx
    L0035: je short L003e
    L0037: mov eax, 1
    L003c: jmp short L0040
    L003e: xor eax, eax
    L0040: pop ebp
    L0041: ret

sharplab

EgorBo · 2021-02-10T16:45:47Z

Well, even a simple

bool Test(object o) => o is int;

emits a sub-optimal codegen:

; Method Test(System.Object):bool:this
G_M7842_IG01:              ;; offset=0000H
						;; bbWeight=1    PerfScore 0.00

G_M7842_IG02:              ;; offset=0000H
       4885D2               test     rdx, rdx
       7411                 je       SHORT G_M7842_IG05
						;; bbWeight=1    PerfScore 1.25

G_M7842_IG03:              ;; offset=0005H
       48B8A09B3944FF7F0000 mov      rax, 0x7FFF44399BA0
       483902               cmp      qword ptr [rdx], rax
       7402                 je       SHORT G_M7842_IG05
						;; bbWeight=0.25 PerfScore 0.81

G_M7842_IG04:              ;; offset=0014H
       33D2                 xor      rdx, rdx
						;; bbWeight=0.13 PerfScore 0.03

G_M7842_IG05:              ;; offset=0016H
       4885D2               test     rdx, rdx
       0F95C0               setne    al
       0FB6C0               movzx    rax, al
						;; bbWeight=1    PerfScore 1.50

G_M7842_IG06:              ;; offset=001FH
       C3                   ret      
						;; bbWeight=1    PerfScore 1.00
; Total bytes of code: 32

The problem here is the fact we emit a giant tree for the following IL:

  ldarg.1
  isinst Int32
  ldnull
  cgt.un
  ret

After importer:

            [000016] -A-X--------              *  ASG       ref   
            [000015] D------N----              +--*  LCL_VAR   ref    V03 tmp1         
            [000014] ---X--------              \--*  QMARK     ref   
            [000007] Q-----------    if           +--*  EQ        int   
            [000006] ------------                 |  +--*  LCL_VAR   ref    V01 arg1         
            [000005] ------------                 |  \--*  CNS_INT   ref    null
            [000013] ---X--------    if           \--*  COLON     ref   
            [000011] ---X-------- else               +--*  QMARK     ref   
            [000004] Q--X--------    if              |  +--*  NE        int   
            [000003] #--X--------                    |  |  +--*  IND       long  
            [000002] ------------                    |  |  |  \--*  LCL_VAR   ref    V01 arg1         
            [000001] H-----------                    |  |  \--*  CNS_INT(h) long   0x7fff44389ba0 class
            [000010] ------------    if              |  \--*  COLON     ref   
            [000008] ------------ else               |     +--*  LCL_VAR   ref    V01 arg1         
            [000009] ------------ then               |     \--*  CNS_INT   ref    null
            [000012] ------------ then               \--*  LCL_VAR   ref    V01 arg1         

            [000020] ------------              *  RETURN    int   
            [000019] N--------U--              \--*  GT        int   
            [000017] ------------                 +--*  LCL_VAR   ref    V03 tmp1         
            [000018] ------------                 \--*  CNS_INT   ref    null

We emit a huge tree for isinst while not needing its return value (we only compare it with null), we only want something like this:

*  RETURN    int        
\--*  QMARK     int   
  +--*  EQ        int   
  |  +--*  LCL_VAR   ref    V01 arg1         
  |  \--*  CNS_INT   ref    null
  \--*  COLON     int   
    +--*  NE        int   
    |  +--*  IND       long  
    |  |  \--*  LCL_VAR   ref    V01 arg1         
    |  \--*  CNS_INT(h) long   0x7fff44389ba0 class
    \--*  CNS_INT   ref    0(false)

AndyAyersMS · 2021-02-10T16:48:33Z

6.0 codegen for M1 and M2 looks pretty good (better than the sharplab above, which is indeed showing 5.0 codegen).

Main difference from 5.0 is likely the CSE of the method table fetches enabled by #45854. M1 could improve its handling of return values a bit.

; Assembly listing for method C:M1(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  5,  4   )     ref  ->  rdx         class-hnd
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V03 tmp1         [V03,T01] (  2,  2   )     ref  ->  rax         class-hnd "spilling QMark2"
;* V04 tmp2         [V04,T04] (  0,  0   )     ref  ->  zero-ref    class-hnd "spilling QMark2"
;  V05 tmp3         [V05,T02] (  3,  2.25)     ref  ->  rdx         class-hnd "spilling QMark2"
;  V06 cse0         [V06,T03] (  4,  2   )    long  ->  rax         "CSE - moderate"
;
; Lcl frame size = 0

G_M18147_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M18147_IG02:
       test     rdx, rdx
       je       SHORT G_M18147_IG09
						;; bbWeight=1    PerfScore 1.25
G_M18147_IG03:
       mov      rax, rdx
       mov      rax, qword ptr [rax]
       mov      rcx, 0xD1FFAB1E
       cmp      rax, rcx
       je       SHORT G_M18147_IG07
       mov      rcx, 0xD1FFAB1E
       cmp      rax, rcx
       je       SHORT G_M18147_IG07
       mov      rcx, 0xD1FFAB1E
       cmp      rax, rcx
       je       SHORT G_M18147_IG05
						;; bbWeight=0.50 PerfScore 3.38
G_M18147_IG04:
       xor      rdx, rdx
						;; bbWeight=0.12 PerfScore 0.03
G_M18147_IG05:
       test     rdx, rdx
       setne    al
       movzx    rax, al
						;; bbWeight=0.50 PerfScore 0.75
G_M18147_IG06:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M18147_IG07:
       mov      eax, 1
						;; bbWeight=0.50 PerfScore 0.12
G_M18147_IG08:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M18147_IG09:
       xor      eax, eax
						;; bbWeight=0.50 PerfScore 0.12
G_M18147_IG10:
       ret      
						;; bbWeight=0.50 PerfScore 0.50

; Assembly listing for method C:M2(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  4,  3.50)     ref  ->  rdx         class-hnd
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V03 cse0         [V03,T01] (  4,  2   )    long  ->  rax         "CSE - aggressive"
;
; Lcl frame size = 0

G_M2592_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M2592_IG02:
       test     rdx, rdx
       je       SHORT G_M2592_IG07
						;; bbWeight=1    PerfScore 1.25
G_M2592_IG03:
       mov      rax, qword ptr [rdx]
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       je       SHORT G_M2592_IG05
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       je       SHORT G_M2592_IG05
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       sete     al
       movzx    rax, al
						;; bbWeight=0.50 PerfScore 3.38
G_M2592_IG04:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M2592_IG05:
       mov      eax, 1
						;; bbWeight=0.50 PerfScore 0.12
G_M2592_IG06:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M2592_IG07:
       xor      eax, eax
						;; bbWeight=0.50 PerfScore 0.12
G_M2592_IG08:
       ret      
						;; bbWeight=0.50 PerfScore 0.50

AndyAyersMS · 2021-02-10T17:03:37Z

However M0 is still not where it could be.

; Assembly listing for method C:M0(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  7,  5   )     ref  ->  rdx         class-hnd
;# V02 OutArgs      [V02    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V03 tmp1         [V03,T01] (  3,  4.50)     ref  ->  rax         class-hnd "spilling QMark2"
;  V04 tmp2         [V04,T03] (  3,  2.50)     ref  ->  rax         class-hnd "spilling QMark2"
;  V05 tmp3         [V05,T02] (  5,  3.75)     ref  ->  rdx         class-hnd "spilling QMark2"
;
; Lcl frame size = 0

G_M57506_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M57506_IG02:
       mov      rax, rdx
       test     rax, rax
       je       SHORT G_M57506_IG04
						;; bbWeight=1    PerfScore 1.50
G_M57506_IG03:
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rax], rcx
       jne      SHORT G_M57506_IG05
						;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG04:
       test     rdx, rdx
       jne      SHORT G_M57506_IG13
						;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG05:
       mov      rax, rdx
       test     rax, rax
       je       SHORT G_M57506_IG07
						;; bbWeight=0.50 PerfScore 0.75
G_M57506_IG06:
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rax], rcx
       jne      SHORT G_M57506_IG08
						;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG07:
       test     rdx, rdx
       jne      SHORT G_M57506_IG13
						;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG08:
       test     rdx, rdx
       je       SHORT G_M57506_IG11
						;; bbWeight=0.50 PerfScore 0.62
G_M57506_IG09:
       mov      rax, 0xD1FFAB1E
       cmp      qword ptr [rdx], rax
       je       SHORT G_M57506_IG11
						;; bbWeight=0.25 PerfScore 0.81
G_M57506_IG10:
       xor      rdx, rdx
						;; bbWeight=0.12 PerfScore 0.03
G_M57506_IG11:
       test     rdx, rdx
       setne    al
       movzx    rax, al
						;; bbWeight=0.50 PerfScore 0.75
G_M57506_IG12:
       ret      
						;; bbWeight=0.50 PerfScore 0.50
G_M57506_IG13:
       mov      eax, 1
						;; bbWeight=0.50 PerfScore 0.12
G_M57506_IG14:
       ret      
						;; bbWeight=0.50 PerfScore 0.50

The jump-threading optimizations introduced in #46257 bail out as there is a temp assign (and hence side effect) in the block threading would like to avoid:

Dominator BB01 of BB06 has relop with same liberal VN:
N003 (  3,  3) [000007] J------N----              *  EQ        int    $100
N001 (  1,  1) [000006] ------------              +--*  LCL_VAR   ref    V03 tmp1         u:1 $80
N002 (  1,  1) [000005] ------------              \--*  CNS_INT   ref    null $VN.Null
 Redundant compare; current relop:
N003 (  3,  3) [000030] J------N----              *  EQ        int    $100
N001 (  1,  1) [000029] ------------              +--*  LCL_VAR   ref    V04 tmp2         u:1 $80
N002 (  1,  1) [000028] ------------              \--*  CNS_INT   ref    null $VN.Null
Both successors of IDom BB01 reach BB06 -- attempting jump threading
BB06 has side effects; no threading

 ------------ BB06 [008..???) -> BB10 (cond), preds={BB04,BB05} succs={BB07,BB10}

***** BB06
STMT00013 (IL 0x008...  ???)
N003 (  5,  4) [000075] -A------R---              *  ASG       ref    $80
N002 (  3,  2) [000074] D------N----              +--*  LCL_VAR   ref    V04 tmp2         d:1 $80
N001 (  1,  1) [000035] ------?-----              \--*  LCL_VAR   ref    V01 arg1         u:1 $80

***** BB06
STMT00011 (IL 0x008...  ???)
N004 (  5,  5) [000072] ------------              *  JTRUE     void  
N003 (  3,  3) [000030] J------N----              \--*  EQ        int    $100
N001 (  1,  1) [000029] ------------                 +--*  LCL_VAR   ref    V04 tmp2         u:1 $80
N002 (  1,  1) [000028] ------------                 \--*  CNS_INT   ref    null $VN.Null

This temp assign is consequence of the expansion of qmarks done by morph in fgExpandQmarkForCastInstOf.

At any rate we can / should extend RBO to allow duplication of the side effecting code. I didn't do this initially because:

if the redundant test block has PHIs then it seems we really ought to do some sensible updating of SSA, and we're not well equipped to do that;
we would need to introduce a new block for one of the threading out comes so that we only need to create one copy (that is, currently for a redundant test block T we group predecessors into two camps -- true preds and false preds -- and in general for both camps we simply jump to the new intended target. Instead one camp would need to jump to a clone of T and the other could continue jumping to T (like we do for fall through preds).

I don't see us getting to this in 6.0 given other priorities, though this sort of chained type test pattern is likely somewhat common (and more of it is on the way from PGO induced GDV) so we may find time to squeeze it in.

AndyAyersMS · 2021-02-10T17:12:43Z

For completeness, here's the 6.0 codegen for M3 and M4.

M3 looks good and benefits from #45854. M4 has the same temp assign blocking issue as noted for M0.

; Assembly listing for method C:M3(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  4,  3.50)     ref  ->  rdx         class-hnd
;  V02 loc0         [V02,T03] (  3,  2   )    bool  ->  rax        
;# V03 OutArgs      [V03    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V04 tmp1         [V04,T01] (  2,  2   )     ref  ->  rdx         class-hnd "spilling QMark2"
;* V05 tmp2         [V05,T04] (  0,  0   )     ref  ->  zero-ref    class-hnd "spilling QMark2"
;* V06 tmp3         [V06,T05] (  0,  0   )     ref  ->  zero-ref    class-hnd "spilling QMark2"
;  V07 cse0         [V07,T02] (  4,  2   )    long  ->  rax         "CSE - moderate"
;
; Lcl frame size = 0

G_M3425_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M3425_IG02:
       test     rdx, rdx
       je       SHORT G_M3425_IG05
						;; bbWeight=1    PerfScore 1.25
G_M3425_IG03:
       mov      rax, qword ptr [rdx]
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       je       SHORT G_M3425_IG04
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       je       SHORT G_M3425_IG04
       mov      rdx, 0xD1FFAB1E
       cmp      rax, rdx
       jne      SHORT G_M3425_IG05
						;; bbWeight=0.50 PerfScore 3.25
G_M3425_IG04:
       mov      eax, 1
       jmp      SHORT G_M3425_IG06
						;; bbWeight=0.50 PerfScore 1.12
G_M3425_IG05:
       xor      eax, eax
						;; bbWeight=0.50 PerfScore 0.12
G_M3425_IG06:
       ret      
						;; bbWeight=1    PerfScore 1.00

; Assembly listing for method C:M4(Object):bool:this
; Emitting BLENDED_CODE for X64 CPU with AVX - Windows
; optimized code
; rsp based frame
; partially interruptible
; discarded IBC profile data due to mismatch in ILSize
; Final local variable assignments
;
;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd
;  V01 arg1         [V01,T00] (  7,  5   )     ref  ->  rdx         class-hnd
;  V02 loc0         [V02,T04] (  3,  2   )    bool  ->  rax        
;# V03 OutArgs      [V03    ] (  1,  1   )  lclBlk ( 0) [rsp+0x00]   "OutgoingArgSpace"
;  V04 tmp1         [V04,T01] (  3,  4.50)     ref  ->  rax         class-hnd "spilling QMark2"
;  V05 tmp2         [V05,T02] (  3,  2.50)     ref  ->  rax         class-hnd "spilling QMark2"
;  V06 tmp3         [V06,T03] (  3,  2.50)     ref  ->  rdx         class-hnd "spilling QMark2"
;
; Lcl frame size = 0

G_M53926_IG01:
						;; bbWeight=1    PerfScore 0.00
G_M53926_IG02:
       mov      rax, rdx
       test     rax, rax
       je       SHORT G_M53926_IG04
						;; bbWeight=1    PerfScore 1.50
G_M53926_IG03:
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rax], rcx
       jne      SHORT G_M53926_IG05
						;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG04:
       test     rdx, rdx
       jne      SHORT G_M53926_IG10
						;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG05:
       mov      rax, rdx
       test     rax, rax
       je       SHORT G_M53926_IG07
						;; bbWeight=0.50 PerfScore 0.75
G_M53926_IG06:
       mov      rcx, 0xD1FFAB1E
       cmp      qword ptr [rax], rcx
       jne      SHORT G_M53926_IG08
						;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG07:
       test     rdx, rdx
       jne      SHORT G_M53926_IG10
						;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG08:
       test     rdx, rdx
       je       SHORT G_M53926_IG11
						;; bbWeight=0.50 PerfScore 0.62
G_M53926_IG09:
       mov      rax, 0xD1FFAB1E
       cmp      qword ptr [rdx], rax
       jne      SHORT G_M53926_IG11
						;; bbWeight=0.25 PerfScore 0.81
G_M53926_IG10:
       mov      eax, 1
       jmp      SHORT G_M53926_IG12
						;; bbWeight=0.50 PerfScore 1.12
G_M53926_IG11:
       xor      eax, eax
						;; bbWeight=0.50 PerfScore 0.12
G_M53926_IG12:
       ret      
						;; bbWeight=1    PerfScore 1.00

EgorBo · 2021-02-10T17:59:10Z

@AndyAyersMS Do you think it makes sense to special case

isinst
ldnull
cgt.un

in the importer, something like this EgorBo@6659f53 or it just should be handled via #48115?

AndyAyersMS · 2021-02-10T18:52:33Z

Handling it early is a good idea. The less IR/flow we introduce early, the better.

huoyaoyuan · 2021-02-11T04:00:23Z

Is the pattern

isinst
brtrue.s / brfalse.s

also handled? It looks to be common too.

AndyAyersMS · 2022-10-01T02:53:38Z

Need to test these out with #76476.

huoyaoyuan · 2023-06-23T11:35:17Z

I've investigated myself, and successfully optimized isinst+ldnull+ceq/cgt.un and isinst+brtrue/brfalse patterns locally. However, I have no idea how to handle the isinst+dup+x pattern, which is used by as followed with ?. or ??.

The hard part here is fgExpandQmarkForCastInstOf was written in mind with as operator only. I have no knowledge about how qmark nodes can participate in further optimization either.

I also tried to introduce a new managed helper for doing the x is Exact or x as Exact check, but found that JIT doesn't allow helper call to be inlined currently.

@EgorBo do you have any suggestions around these?

EgorBo · 2023-06-23T11:52:06Z

@EgorBo do you have any suggestions around these?

Not really, ?? and other cases often produce multi-use boxing that we can't pattern match here

but found that JIT doesn't allow helper call to be inlined currently.

it can be implemented, just a bit of work + new JIT-EE API

huoyaoyuan · 2023-06-23T18:12:35Z

it can be implemented, just a bit of work + new JIT-EE API

I'd like to try it. What information need to be passed in the interface? Eagerly load and pass the body of helper method?

AndyAyersMS · 2024-06-14T02:38:44Z

All of M0-M4 now produce very similar code with 9p5 ish bits (M2 has slightly different layout)

; Method C:M4(System.Object):ubyte:this (FullOpts)
G_M53340_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M53340_IG02:  ;; offset=0x0000
       test     rdx, rdx
       je       SHORT G_M53340_IG05
						;; size=5 bbWeight=1 PerfScore 1.25

G_M53340_IG03:  ;; offset=0x0005
       mov      rax, qword ptr [rdx]
       mov      rcx, 0x7FFE205A5A10      ; System.Int32
       cmp      rax, rcx
       je       SHORT G_M53340_IG04
       mov      rcx, 0x7FFE2062CA58      ; System.UInt32
       cmp      rax, rcx
       je       SHORT G_M53340_IG04
       mov      rcx, 0x7FFE206613C0      ; System.Int64
       cmp      rax, rcx
       jne      SHORT G_M53340_IG05
						;; size=48 bbWeight=0.25 PerfScore 1.62

G_M53340_IG04:  ;; offset=0x0035
       mov      eax, 1
       jmp      SHORT G_M53340_IG06
						;; size=7 bbWeight=0.50 PerfScore 1.12

G_M53340_IG05:  ;; offset=0x003C
       xor      eax, eax
						;; size=2 bbWeight=0.50 PerfScore 0.12

G_M53340_IG06:  ;; offset=0x003E
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 63

Egor's Test example however still has some convoluted flow

; Method C:Test(System.Object):ubyte:this (FullOpts)
G_M56083_IG01:  ;; offset=0x0000
						;; size=0 bbWeight=1 PerfScore 0.00

G_M56083_IG02:  ;; offset=0x0000
       test     rdx, rdx
       je       SHORT G_M56083_IG05
						;; size=5 bbWeight=1 PerfScore 1.25

G_M56083_IG03:  ;; offset=0x0005
       mov      rax, 0x7FFE20595A10      ; System.Int32
       cmp      qword ptr [rdx], rax
       jne      SHORT G_M56083_IG05
						;; size=15 bbWeight=0.25 PerfScore 1.06

G_M56083_IG04:  ;; offset=0x0014
       jmp      SHORT G_M56083_IG06
						;; size=2 bbWeight=0.12 PerfScore 0.25

G_M56083_IG05:  ;; offset=0x0016
       xor      rdx, rdx
						;; size=2 bbWeight=0.25 PerfScore 0.06

G_M56083_IG06:  ;; offset=0x0018
       test     rdx, rdx
       setne    al
       movzx    rax, al
						;; size=9 bbWeight=1 PerfScore 1.50

G_M56083_IG07:  ;; offset=0x0021
       ret      
						;; size=1 bbWeight=1 PerfScore 1.00
; Total bytes of code: 34

But looks like #103391 will address that.

huoyaoyuan added the tenet-performance Performance related issue label Feb 5, 2021

dotnet-issue-labeler bot added area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI untriaged New issue has not been triaged by the area owner labels Feb 5, 2021

JulieLeeMSFT assigned AndyAyersMS Feb 5, 2021

JulieLeeMSFT added needs-further-triage Issue has been initially triaged, but needs deeper consideration or reconsideration and removed untriaged New issue has not been triaged by the area owner labels Feb 5, 2021

AndyAyersMS removed the needs-further-triage Issue has been initially triaged, but needs deeper consideration or reconsideration label Feb 10, 2021

AndyAyersMS added this to the Future milestone Feb 10, 2021

AndyAyersMS mentioned this issue Feb 10, 2021

Redundant Branch Opts Enhancements #48115

Open

huoyaoyuan mentioned this issue Jun 22, 2021

Import cgt.un(op, 0) as NE(op, 0) #54539

Merged

EgorBo mentioned this issue Jul 16, 2021

Improve codegen for delegate cast / type check? #55841

Closed

EgorBo mentioned this issue Sep 29, 2022

JIT: optimize redundant branches by looking through phis #76283

Merged

huoyaoyuan mentioned this issue Jun 8, 2023

Import isinst/castclass Nullable as underlying type #87241

Merged

huoyaoyuan mentioned this issue Sep 14, 2023

Optimize unboxing in System.Data.Common.ObjectStorage:Set #91945

Closed

huoyaoyuan mentioned this issue Dec 7, 2023

Remove boxing from "value is S" for nullables #95711

Merged

huoyaoyuan mentioned this issue May 8, 2024

Convert OAVariant interop to managed #100176

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Ineffective codegen for or pattern type checking #47920

Ineffective codegen for or pattern type checking #47920

huoyaoyuan commented Feb 5, 2021 •

edited by AndyAyersMS

Loading

AndyAyersMS commented Feb 5, 2021

huoyaoyuan commented Feb 5, 2021

AndyAyersMS commented Feb 5, 2021

huoyaoyuan commented Feb 5, 2021

jnyrup commented Feb 6, 2021

EgorBo commented Feb 10, 2021 •

edited

Loading

AndyAyersMS commented Feb 10, 2021

AndyAyersMS commented Feb 10, 2021

AndyAyersMS commented Feb 10, 2021

EgorBo commented Feb 10, 2021

AndyAyersMS commented Feb 10, 2021

huoyaoyuan commented Feb 11, 2021

AndyAyersMS commented Oct 1, 2022

huoyaoyuan commented Jun 23, 2023

EgorBo commented Jun 23, 2023

huoyaoyuan commented Jun 23, 2023

AndyAyersMS commented Jun 14, 2024

Ineffective codegen for or pattern type checking #47920

Ineffective codegen for or pattern type checking #47920

Comments

huoyaoyuan commented Feb 5, 2021 • edited by AndyAyersMS Loading

Description

Configuration

Regression?

AndyAyersMS commented Feb 5, 2021

huoyaoyuan commented Feb 5, 2021

AndyAyersMS commented Feb 5, 2021

huoyaoyuan commented Feb 5, 2021

jnyrup commented Feb 6, 2021

EgorBo commented Feb 10, 2021 • edited Loading

AndyAyersMS commented Feb 10, 2021

AndyAyersMS commented Feb 10, 2021

AndyAyersMS commented Feb 10, 2021

EgorBo commented Feb 10, 2021

AndyAyersMS commented Feb 10, 2021

huoyaoyuan commented Feb 11, 2021

AndyAyersMS commented Oct 1, 2022

huoyaoyuan commented Jun 23, 2023

EgorBo commented Jun 23, 2023

huoyaoyuan commented Jun 23, 2023

AndyAyersMS commented Jun 14, 2024

huoyaoyuan commented Feb 5, 2021 •

edited by AndyAyersMS

Loading

EgorBo commented Feb 10, 2021 •

edited

Loading